Commit a903a001 authored by fenglei.tian

Merge remote-tracking branch 'origin/master' into tfl/gpu_add_nvrtc

parents ee220ffb ce3670b5
......@@ -48,6 +48,7 @@ output/
*.mpg
*.cpio
*.wav
*.backup
doc/source/generated
.cache/
nervana_aeon.egg-info/
......
......@@ -30,20 +30,24 @@ if (NGRAPH_ARGON_ENABLE)
endif()
# Repository
set(ARGON_TRANSFORMER_CMAKE_GIT_REPOSITORY git@github.com:NervanaSystems/argon-transformer.git)
if (DEFINED CUSTOM_ARGON_TRANSFORMER_GIT_REPOSITORY)
set(ARGON_TRANSFORMER_GIT_REPOSITORY ${CUSTOM_ARGON_TRANSFORMER_GIT_REPOSITORY})
else()
set(ARGON_TRANSFORMER_GIT_REPOSITORY git@github.com:NervanaSystems/argon-transformer.git)
endif()
# Set argon_transformer tag
# Notes:
# - Before we have ngraph CI job for argon transformer, ngraph master might not be
# compatible with argon transformer. To ensure compatibility, checkout the ngraph commit point
# where the following `ARGON_TRANSFORMER_CMAKE_GIT_TAG` is set and build ngraph with argon using this
# where the following `ARGON_TRANSFORMER_GIT_TAG` is set and build ngraph with argon using this
# commit.
# - After we have ngraph CI job for argon transformer, ngraph master will be compatible with
# argon transformer guaranteed by CI.
set(ARGON_TRANSFORMER_CMAKE_GIT_TAG cpp-master)
set(ARGON_TRANSFORMER_GIT_TAG cpp-master)
# Determines where argon-transformer will be located
set(ARGON_TRANSFORMER_CMAKE_PREFIX ${CMAKE_CURRENT_BINARY_DIR}/argon_transformer)
set(ARGON_TRANSFORMER_PREFIX ${CMAKE_CURRENT_BINARY_DIR}/argon_transformer)
# Print
message(STATUS "NGRAPH_INCLUDE_PATH: ${NGRAPH_INCLUDE_PATH}")
......@@ -56,27 +60,35 @@ if (NGRAPH_ARGON_ENABLE)
ExternalProject_Add(
ext_argon_transformer
SOURCE_DIR ${CUSTOM_ARGON_TRANSFORMER_DIR}
PREFIX ${ARGON_TRANSFORMER_CMAKE_PREFIX}
PREFIX ${ARGON_TRANSFORMER_PREFIX}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-DNGRAPH_INSTALL_PREFIX=${ARGON_TRANSFORMER_CMAKE_PREFIX}
CMAKE_ARGS
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DNGRAPH_INSTALL_PREFIX=${ARGON_TRANSFORMER_PREFIX}
-DPREBUILD_ARGON_API_PATH=${NGRAPH_PREBUILD_ARGON_API_PATH}
-DEXTERNAL_NGRAPH_INCLUDE_DIR=${NGRAPH_INCLUDE_PATH}
-DINSTALLED_HEADERS_PATH=${CMAKE_INSTALL_PREFIX}/include
-DMKLDNN_INCLUDE_DIR=${MKLDNN_INCLUDE_DIR}
BUILD_ALWAYS 1
)
else()
ExternalProject_Add(
ext_argon_transformer
GIT_REPOSITORY ${ARGON_TRANSFORMER_CMAKE_GIT_REPOSITORY}
GIT_TAG ${ARGON_TRANSFORMER_CMAKE_GIT_TAG}
PREFIX ${ARGON_TRANSFORMER_CMAKE_PREFIX}
GIT_REPOSITORY ${ARGON_TRANSFORMER_GIT_REPOSITORY}
GIT_TAG ${ARGON_TRANSFORMER_GIT_TAG}
PREFIX ${ARGON_TRANSFORMER_PREFIX}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-DNGRAPH_INSTALL_PREFIX=${ARGON_TRANSFORMER_CMAKE_PREFIX}
CMAKE_ARGS
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DNGRAPH_INSTALL_PREFIX=${ARGON_TRANSFORMER_PREFIX}
-DPREBUILD_ARGON_API_PATH=${NGRAPH_PREBUILD_ARGON_API_PATH}
-DEXTERNAL_NGRAPH_INCLUDE_DIR=${NGRAPH_INCLUDE_PATH}
-DINSTALLED_HEADERS_PATH=${CMAKE_INSTALL_PREFIX}/include
-DMKLDNN_INCLUDE_DIR=${MKLDNN_INCLUDE_DIR}
BUILD_ALWAYS 1
)
endif()
......@@ -85,29 +97,37 @@ if (NGRAPH_ARGON_ENABLE)
ExternalProject_Add(
ext_argon_transformer
SOURCE_DIR ${CUSTOM_ARGON_TRANSFORMER_DIR}
PREFIX ${ARGON_TRANSFORMER_CMAKE_PREFIX}
PREFIX ${ARGON_TRANSFORMER_PREFIX}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-DNGRAPH_INSTALL_PREFIX=${ARGON_TRANSFORMER_CMAKE_PREFIX}
CMAKE_ARGS
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DNGRAPH_INSTALL_PREFIX=${ARGON_TRANSFORMER_PREFIX}
-DPREBUILD_ARGON_API_PATH=${NGRAPH_PREBUILD_ARGON_API_PATH}
-DEXTERNAL_NGRAPH_INCLUDE_DIR=${NGRAPH_INCLUDE_PATH}
-DINSTALLED_HEADERS_PATH=${CMAKE_INSTALL_PREFIX}/include
BUILD_BYPRODUCTS ${ARGON_TRANSFORMER_CMAKE_PREFIX}
-DMKLDNN_INCLUDE_DIR=${MKLDNN_INCLUDE_DIR}
BUILD_BYPRODUCTS ${ARGON_TRANSFORMER_PREFIX}
BUILD_ALWAYS 1
)
else()
ExternalProject_Add(
ext_argon_transformer
GIT_REPOSITORY ${ARGON_TRANSFORMER_CMAKE_GIT_REPOSITORY}
GIT_TAG ${ARGON_TRANSFORMER_CMAKE_GIT_TAG}
PREFIX ${ARGON_TRANSFORMER_CMAKE_PREFIX}
GIT_REPOSITORY ${ARGON_TRANSFORMER_GIT_REPOSITORY}
GIT_TAG ${ARGON_TRANSFORMER_GIT_TAG}
PREFIX ${ARGON_TRANSFORMER_PREFIX}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-DNGRAPH_INSTALL_PREFIX=${ARGON_TRANSFORMER_CMAKE_PREFIX}
CMAKE_ARGS
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DNGRAPH_INSTALL_PREFIX=${ARGON_TRANSFORMER_PREFIX}
-DPREBUILD_ARGON_API_PATH=${NGRAPH_PREBUILD_ARGON_API_PATH}
-DEXTERNAL_NGRAPH_INCLUDE_DIR=${NGRAPH_INCLUDE_PATH}
-DINSTALLED_HEADERS_PATH=${CMAKE_INSTALL_PREFIX}/include
BUILD_BYPRODUCTS ${ARGON_TRANSFORMER_CMAKE_PREFIX}
-DMKLDNN_INCLUDE_DIR=${MKLDNN_INCLUDE_DIR}
BUILD_BYPRODUCTS ${ARGON_TRANSFORMER_PREFIX}
BUILD_ALWAYS 1
)
endif()
......@@ -115,8 +135,8 @@ if (NGRAPH_ARGON_ENABLE)
ExternalProject_Get_Property(ext_argon_transformer source_dir)
set(ARGON_TRANSFORMER_SOURCE_DIR ${source_dir} PARENT_SCOPE)
set(ARGON_TRANSFORMER_INCLUDE_DIR ${ARGON_TRANSFORMER_CMAKE_PREFIX}/include PARENT_SCOPE)
set(ARGON_TRANSFORMER_LIB_DIR ${ARGON_TRANSFORMER_CMAKE_PREFIX}/lib PARENT_SCOPE)
set(ARGON_TRANSFORMER_INCLUDE_DIR ${ARGON_TRANSFORMER_PREFIX}/include PARENT_SCOPE)
set(ARGON_TRANSFORMER_LIB_DIR ${ARGON_TRANSFORMER_PREFIX}/lib PARENT_SCOPE)
set(ARGON_API_INCLUDE_DIR ${NGRAPH_PREBUILD_ARGON_API_PATH}/include PARENT_SCOPE)
set(ARGON_API_LIB_DIR ${NGRAPH_PREBUILD_ARGON_API_PATH}/lib) # Used by find_library below
set(ARGON_API_LIB_DIR ${NGRAPH_PREBUILD_ARGON_API_PATH}/lib PARENT_SCOPE)
......
......@@ -81,4 +81,8 @@ if(NGRAPH_CPU_ENABLE)
set(MKLDNN_INCLUDE_DIR "${EXTERNAL_PROJECTS_ROOT}/mkldnn/include" PARENT_SCOPE)
set(MKLDNN_LIB_DIR "${EXTERNAL_PROJECTS_ROOT}/mkldnn/lib" PARENT_SCOPE)
# Other .cmake files in the current scope (e.g. Argon Transformer) need this path as well
set(MKLDNN_INCLUDE_DIR "${EXTERNAL_PROJECTS_ROOT}/mkldnn/include")
set(MKLDNN_LIB_DIR "${EXTERNAL_PROJECTS_ROOT}/mkldnn/lib")
endif()
......@@ -42,21 +42,23 @@ bash_lib_status "Verified that '${CLANG_FORMAT_PROG}' has version '${REQUIRED_CL
pushd "${THIS_SCRIPT_DIR}/.."
declare ARGON_SRC_DIR="build/third-party/argon_transformer/src/ext_argon_transformer/src"
declare ARGON_TEST_DIR="build/third-party/argon_transformer/src/ext_argon_transformer/test"
declare ROOT_SUBDIR
for ROOT_SUBDIR in src test; do
for ROOT_SUBDIR in src test ${ARGON_SRC_DIR} ${ARGON_TEST_DIR}; do
if ! [[ -d "${ROOT_SUBDIR}" ]]; then
bash_lib_die "In directory '$(pwd)', no subdirectory named '${ROOT_SUBDIR}' was found."
fi
bash_lib_status "About to format C/C++ code in directory tree '$(pwd)/${ROOT_SUBDIR}' ..."
bash_lib_status "In directory '$(pwd)', no subdirectory named '${ROOT_SUBDIR}' was found."
else
bash_lib_status "About to format C/C++ code in directory tree '$(pwd)/${ROOT_SUBDIR}' ..."
# Note that we restrict to "-type f" to exclude symlinks. Emacs sometimes
# creates dangling symlinks with .cpp/.hpp suffixes as a sort of locking
# mechanism, and this confuses clang-format.
find "${ROOT_SUBDIR}" -type f -and \( -name '*.cpp' -or -name '*.hpp' \) | xargs "${CLANG_FORMAT_PROG}" -i -style=file
# Note that we restrict to "-type f" to exclude symlinks. Emacs sometimes
# creates dangling symlinks with .cpp/.hpp suffixes as a sort of locking
# mechanism, and this confuses clang-format.
find "${ROOT_SUBDIR}" -type f -and \( -name '*.cpp' -or -name '*.hpp' \) | xargs "${CLANG_FORMAT_PROG}" -i -style=file
bash_lib_status "Done."
bash_lib_status "Done."
fi
done
popd
......@@ -45,23 +45,26 @@ declare NUM_FILES_CHECKED=0
pushd "${THIS_SCRIPT_DIR}/.."
declare ARGON_SRC_DIR="build/third-party/argon_transformer/src/ext_argon_transformer/src"
declare ARGON_TEST_DIR="build/third-party/argon_transformer/src/ext_argon_transformer/test"
declare ROOT_SUBDIR
for ROOT_SUBDIR in src test; do
for ROOT_SUBDIR in src test ${ARGON_SRC_DIR} ${ARGON_TEST_DIR}; do
if ! [[ -d "${ROOT_SUBDIR}" ]]; then
bash_lib_die "In directory '$(pwd)', no subdirectory named '${ROOT_SUBDIR}' was found."
bash_lib_status "In directory '$(pwd)', no subdirectory named '${ROOT_SUBDIR}' was found."
else
bash_lib_status "About to format C/C++ code in directory tree '$(pwd)/${ROOT_SUBDIR}' ..."
declare SRC_FILE
# Note that we restrict to "-type f" to exclude symlinks. Emacs sometimes
# creates dangling symlinks with .cpp/.hpp suffixes as a sort of locking
# mechanism, and this confuses clang-format.
for SRC_FILE in $(find "${ROOT_SUBDIR}" -type f -and \( -name '*.cpp' -or -name '*.hpp' \) ); do
if "${CLANG_FORMAT_PROG}" -style=file -output-replacements-xml "${SRC_FILE}" | grep -c "<replacement " >/dev/null; then
FAILED_FILES+=( "${SRC_FILE}" )
fi
NUM_FILES_CHECKED=$((NUM_FILES_CHECKED+1))
done
fi
bash_lib_status "About to format C/C++ code in directory tree '$(pwd)/${ROOT_SUBDIR}' ..."
declare SRC_FILE
# Note that we restrict to "-type f" to exclude symlinks. Emacs sometimes
# creates dangling symlinks with .cpp/.hpp suffixes as a sort of locking
# mechanism, and this confuses clang-format.
for SRC_FILE in $(find "${ROOT_SUBDIR}" -type f -and \( -name '*.cpp' -or -name '*.hpp' \) ); do
if "${CLANG_FORMAT_PROG}" -style=file -output-replacements-xml "${SRC_FILE}" | grep -c "<replacement " >/dev/null; then
FAILED_FILES+=( "${SRC_FILE}" )
fi
NUM_FILES_CHECKED=$((NUM_FILES_CHECKED+1))
done
done
popd
......@@ -76,4 +79,3 @@ else
done
exit 1
fi
......@@ -38,7 +38,7 @@ set (SRC
ops/avg_pool.cpp
ops/batch_norm.cpp
ops/broadcast.cpp
ops/concatenate.cpp
ops/concat.cpp
ops/constant.cpp
ops/convert.cpp
ops/convolution.cpp
......@@ -78,12 +78,13 @@ set (SRC
ops/tan.cpp
ops/tanh.cpp
ops/util/arithmetic_reduction.cpp
ops/util/binary_elementwise_arithmetic.cpp
ops/util/binary_elementwise_comparison.cpp
ops/util/binary_elementwise.cpp
ops/util/requires_tensor_view_args.cpp
ops/util/unary_elementwise_arithmetic.cpp
ops/util/binary_elementwise_arithmetic.cpp
ops/util/binary_elementwise_comparison.cpp
ops/util/binary_elementwise.cpp
ops/util/requires_tensor_view_args.cpp
ops/util/unary_elementwise_arithmetic.cpp
ops/util/unary_elementwise.cpp
pass/assign_placement.cpp
pass/dump_sorted.cpp
pass/graph_rewrite.cpp
pass/inliner.cpp
......@@ -112,6 +113,7 @@ set (SRC
types/type.cpp
util.cpp
graph_util.cpp
placement.cpp
)
message(STATUS ${CMAKE_CURRENT_SOURCE_DIR}/ops)
......
......@@ -411,24 +411,21 @@ void codegen::StaticCompiler::configure_search_path()
void codegen::StaticCompiler::load_headers_from_resource()
{
const string builtin_root = "/$builtin";
HeaderSearchOptions& hso = m_compiler->getInvocation().getHeaderSearchOpts();
PreprocessorOptions& preprocessor_options = m_compiler->getInvocation().getPreprocessorOpts();
std::set<std::string> header_search_paths;
for (const HeaderInfo& hi : header_info)
for (const string& search_path : builtin_search_paths)
{
string search_path = hi.search_path;
string absolute_path = file_util::path_join(search_path, hi.header_path);
string builtin = "/$builtin" + absolute_path;
string builtin = builtin_root + search_path;
hso.AddPath(builtin, clang::frontend::System, false, false);
}
for (const pair<string, string>& header_info : builtin_headers)
{
string absolute_path = header_info.first;
string builtin = builtin_root + absolute_path;
std::unique_ptr<llvm::MemoryBuffer> mb(
llvm::MemoryBuffer::getMemBuffer(hi.header_data, builtin));
llvm::MemoryBuffer::getMemBuffer(header_info.second, builtin));
preprocessor_options.addRemappedFile(builtin, mb.release());
if (!contains(header_search_paths, search_path))
{
string builtin = "/$builtin" + search_path;
hso.AddPath(builtin, clang::frontend::System, false, false);
header_search_paths.insert(search_path);
}
}
}
......
......@@ -30,11 +30,24 @@
#include <unordered_set>
#include <vector>
#include "ngraph/placement.hpp"
namespace ngraph
{
class Node;
class Function;
namespace descriptor
{
class Input;
class Output;
}
namespace op
{
class Parameter;
}
void traverse_nodes(const std::shared_ptr<const Function> p,
std::function<void(std::shared_ptr<Node>)> f);
void traverse_nodes(const Function* p, std::function<void(std::shared_ptr<Node>)> f);
......@@ -60,7 +73,7 @@ namespace ngraph
class NodeMap
{
public:
// map original node to replcacement node
// map original node to replacement node
// throws ngraph_error if key already exists
void add(std::shared_ptr<ngraph::Node> orig, std::shared_ptr<ngraph::Node> replacement);
......@@ -100,4 +113,18 @@ namespace ngraph
// NodeMap output (by reference) fully maps input and cloned function ops
std::shared_ptr<ngraph::Function> clone_function(std::shared_ptr<ngraph::Function> func,
NodeMap& node_map);
// Assert that all nodes in the function are colocated and return that placement
Placement get_colocated_function_placement(std::shared_ptr<Function> func);
// Split a function into function(s), each with a single placement
std::vector<std::shared_ptr<Function>> split_function_by_placement(
std::shared_ptr<Function> f,
std::unordered_map<std::shared_ptr<op::Parameter>, std::shared_ptr<Node>>&
map_parameter_to_source_node);
// Insert parameter node between src_node and dst_node by splitting the graph
void insert_parameter_split_between(std::shared_ptr<Node> src_node,
std::shared_ptr<Node> dst_node,
std::shared_ptr<op::Parameter> p_node);
}
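
A minimal usage sketch of the three placement utilities declared above; the include paths, the wrapper function, and the backend-dispatch comment are assumptions for illustration, only the declared signatures come from this commit.

#include <memory>
#include <unordered_map>
#include <vector>
#include "ngraph/function.hpp"      // assumed header locations
#include "ngraph/graph_util.hpp"
#include "ngraph/ops/parameter.hpp"
#include "ngraph/placement.hpp"

using namespace ngraph;

// Split a mixed-placement function and verify each piece is colocated.
void partition_sketch(std::shared_ptr<Function> f)
{
    // For each Parameter of a sub-function, records the node in the original
    // graph that produces the value it stands in for.
    std::unordered_map<std::shared_ptr<op::Parameter>, std::shared_ptr<Node>>
        map_parameter_to_source_node;

    std::vector<std::shared_ptr<Function>> pieces =
        split_function_by_placement(f, map_parameter_to_source_node);

    for (const auto& piece : pieces)
    {
        // Throws if a piece still mixes placements; otherwise returns the
        // single placement shared by all of its nodes.
        Placement p = get_colocated_function_placement(piece);
        (void)p; // e.g. select a backend based on p
    }
}
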
......@@ -73,7 +73,7 @@
#include "ngraph/ops/avg_pool.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/ceiling.hpp"
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/concat.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convert.hpp"
#include "ngraph/ops/convolution.hpp"
......
......@@ -23,6 +23,7 @@
#include "ngraph/descriptor/layout/tensor_view_layout.hpp"
#include "ngraph/descriptor/primary_tensor_view.hpp"
#include "ngraph/ops/parameter.hpp"
#include "ngraph/placement.hpp"
using namespace std;
using namespace ngraph;
......@@ -144,6 +145,16 @@ void Node::set_name(const string& name)
}
}
Placement Node::get_placement() const
{
return m_placement;
}
void Node::set_placement(Placement placement)
{
m_placement = placement;
}
std::shared_ptr<Node> Node::get_input_op(size_t index)
{
for (auto arg : m_arguments)
......@@ -304,3 +315,27 @@ bool Node::has_same_type(std::shared_ptr<const Node> node) const
}
return true;
}
descriptor::Input* Node::get_input_from(const shared_ptr<Node>& src)
{
for (size_t i = 0; i < this->get_input_size(); ++i)
{
if (this->get_input_op(i) == src)
{
return &(this->get_inputs().at(i));
}
}
throw ngraph_error("Error: src is not one of self's input Node");
}
descriptor::Output* Node::get_output_to(const shared_ptr<Node>& dst)
{
for (size_t i = 0; i < dst->get_input_size(); ++i)
{
if (dst->get_input_op(i).get() == this)
{
return &(dst->get_inputs().at(i).get_output());
}
}
throw ngraph_error("Error: dst is not one of self's output Node");
}
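
A small sketch of how the two new accessors might be used; the wrapper function is hypothetical, only the accessor signatures come from this commit.

// Given an edge src -> dst, locate the descriptors on either end of it.
// Both calls throw ngraph_error when the nodes are not actually connected.
void inspect_edge(const std::shared_ptr<ngraph::Node>& src,
                  const std::shared_ptr<ngraph::Node>& dst)
{
    ngraph::descriptor::Output* out = src->get_output_to(dst); // output of src that feeds dst
    ngraph::descriptor::Input* in = dst->get_input_from(src);  // input of dst fed by src
    (void)out;
    (void)in;
}
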
......@@ -32,6 +32,7 @@
#include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/output.hpp"
#include "ngraph/descriptor/tensor.hpp"
#include "ngraph/placement.hpp"
#include "ngraph/types/type.hpp"
namespace ngraph
......@@ -39,6 +40,10 @@ namespace ngraph
void replace_node_users_arguments(std::shared_ptr<Node> target,
std::shared_ptr<Node> replacement);
void insert_parameter_split_between(std::shared_ptr<Node> src_node,
std::shared_ptr<Node> dst_node,
std::shared_ptr<op::Parameter> p_node);
/// Nodes are the backbone of the graph of Value dataflow. Every node has
/// zero or more nodes as arguments and one value, which is either a tensor
/// view or a (possibly empty) tuple of values.
......@@ -49,6 +54,9 @@ namespace ngraph
friend class descriptor::Input;
friend void replace_node_users_arguments(std::shared_ptr<Node> target,
std::shared_ptr<Node> replacement);
friend void insert_parameter_split_between(std::shared_ptr<Node> src_node,
std::shared_ptr<Node> dst_node,
std::shared_ptr<op::Parameter> p_node);
protected:
Node(const std::string& node_type, const Nodes& arguments);
......@@ -165,9 +173,21 @@ namespace ngraph
virtual std::vector<std::shared_ptr<Function>> get_functions() const;
// True if this and node have one output with same element type and shape
/// True if this and node have one output with same element type and shape
bool has_same_type(std::shared_ptr<const Node> node) const;
/// Get device placement
Placement get_placement() const;
/// Set device placement
void set_placement(Placement placement);
/// Get input descriptor that is connected to src
descriptor::Input* get_input_from(const std::shared_ptr<Node>& src);
/// Get output descriptor that outputs to dst
descriptor::Output* get_output_to(const std::shared_ptr<Node>& dst);
protected:
void add_output(const element::Type& element_type, const Shape& shape);
......@@ -180,9 +200,11 @@ namespace ngraph
std::deque<descriptor::Output> m_outputs;
bool m_is_output;
std::unordered_map<Node*, autodiff::Adjoints> m_adjoint_map;
Placement m_placement = Placement::DEFAULT;
private:
Nodes m_arguments;
//m_arguments still needs to be kept in sync with i/o since get_input_ops
//is pretty ubiquitous and might be called after the original graph was modified.
//get_input_ops uses m_arguments to check if a node view reconstruction from i/o
......
......@@ -17,7 +17,7 @@
#include <cassert>
#include <memory>
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/concat.hpp"
#include "ngraph/ops/slice.hpp"
using namespace std;
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <cassert>
#include <memory>
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/slice.hpp"
using namespace std;
using namespace ngraph;
op::Concat::Concat(const Nodes& args, size_t concatenation_axis)
: RequiresTensorViewArgs("Concat", args)
, m_concatenation_axis(concatenation_axis)
{
if (m_inputs.size() < 1)
{
throw ngraph_error("At least one argument required");
}
auto& input_0 = get_inputs().at(0);
auto input_0_shape = input_0.get_shape();
if (m_concatenation_axis >= input_0_shape.size())
{
throw ngraph_error("Concatenation axis is out of bounds");
}
size_t concatenation_axis_length = input_0_shape.at(m_concatenation_axis);
auto& input_0_element_type = input_0.get_element_type();
for (auto i = 1; i < get_inputs().size(); i++)
{
auto& input_i = get_inputs().at(i);
auto input_i_shape = input_i.get_shape();
if (input_i_shape.size() != input_0_shape.size())
{
throw ngraph_error("Arguments to concat do not have same rank");
}
if (input_i.get_element_type() != input_0_element_type)
{
throw ngraph_error("Argument element types do not match");
}
for (auto j = 0; j < input_i_shape.size(); j++)
{
if (j != m_concatenation_axis && input_0_shape.at(j) != input_i_shape.at(j))
{
throw ngraph_error(
"Arguments to concat do not have same dimension on a non-concatenation axis");
}
else if (j == m_concatenation_axis)
{
concatenation_axis_length += input_i_shape.at(j);
}
}
}
vector<size_t> concatenated_shape = input_0_shape;
concatenated_shape.at(m_concatenation_axis) = concatenation_axis_length;
set_value_type_checked(make_shared<TensorViewType>(input_0_element_type, concatenated_shape));
}
void op::Concat::generate_adjoints(autodiff::Adjoints& adjoints, const std::shared_ptr<Node>& delta)
{
auto concat_result_shape = get_outputs().at(0).get_shape();
Coordinate arg_delta_slice_lower = Coordinate(concat_result_shape.size(), 0);
Coordinate arg_delta_slice_upper = concat_result_shape;
Coordinate arg_delta_slice_strides = Coordinate(concat_result_shape.size(), 1);
size_t pos = 0;
for (auto arg : get_input_ops())
{
auto arg_shape = arg->get_shape();
auto slice_width = arg_shape[m_concatenation_axis];
size_t next_pos = pos + slice_width;
arg_delta_slice_lower[m_concatenation_axis] = pos;
arg_delta_slice_upper[m_concatenation_axis] = next_pos;
adjoints.add_delta(
arg,
make_shared<op::Slice>(
delta, arg_delta_slice_lower, arg_delta_slice_upper, arg_delta_slice_strides));
pos = next_pos;
}
}
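
A tiny shape example of what this constructor checks and computes; the identifiers below are illustrative and not part of the commit.

// Two float32 arguments of shape {2, 3} and {2, 4}, concatenated along axis 1,
// pass the rank/type/dimension checks and produce an output of shape {2, 7}.
auto A = std::make_shared<op::Parameter>(element::f32, Shape{2, 3});
auto B = std::make_shared<op::Parameter>(element::f32, Shape{2, 4});
auto C = std::make_shared<op::Concat>(Nodes{A, B}, 1);
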
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/pass/assign_placement.hpp"
#include "ngraph/node.hpp"
#include "ngraph/placement.hpp"
using namespace std;
using namespace ngraph;
ngraph::pass::AssignPlacement::AssignPlacement(
std::function<Placement(std::shared_ptr<Node>)> placement_policy)
: m_placement_policy(placement_policy)
{
}
bool ngraph::pass::AssignPlacement::run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes)
{
for (const std::shared_ptr<Node>& node : nodes)
{
run_on_node(node);
}
return false;
}
bool ngraph::pass::AssignPlacement::run_on_node(shared_ptr<Node> node)
{
node->set_placement(m_placement_policy(node));
return false;
}
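
A minimal sketch of driving this pass, assuming the usual pass::Manager::register_pass<T>(...) interface; the lambda policy and the function f are illustrative only.

ngraph::pass::Manager pass_manager;
pass_manager.register_pass<ngraph::pass::AssignPlacement>(
    [](std::shared_ptr<ngraph::Node> /*node*/) {
        // Trivial policy: place every node on the CPU backend.
        return ngraph::Placement::CPU;
    });
pass_manager.run_passes(f); // f: std::shared_ptr<ngraph::Function>
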
......@@ -16,36 +16,26 @@
#pragma once
#include <memory>
#include <exception>
#include <sstream>
#include "ngraph/ops/util/requires_tensor_view_args.hpp"
#include "ngraph/pass/pass.hpp"
#include "ngraph/placement.hpp"
namespace ngraph
{
namespace op
namespace pass
{
/// \brief Concatenation operation.
class Concat : public util::RequiresTensorViewArgs
class AssignPlacement : public CallGraphPass
{
public:
/// \brief Constructs a concatenation operation.
///
/// \param args The nodes producing the input tensors.
/// \param concatenation_axis The axis along which to concatenate the input tensors.
Concat(const Nodes& args, size_t concatenation_axis);
// TODO: make policy a class
AssignPlacement(std::function<Placement(std::shared_ptr<Node>)> placement_policy);
virtual bool run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes) override;
virtual std::shared_ptr<Node> copy_with_new_args(
const std::vector<std::shared_ptr<Node>>& new_args) const override
{
return std::make_shared<Concat>(new_args, m_concatenation_axis);
}
/// \return The concatenation axis.
size_t get_concatenation_axis() const { return m_concatenation_axis; }
protected:
virtual void generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta) override;
const size_t m_concatenation_axis;
private:
bool run_on_node(std::shared_ptr<Node> node);
std::function<Placement(std::shared_ptr<Node>)> m_placement_policy;
};
}
}
......@@ -30,6 +30,12 @@ using namespace std;
using namespace ngraph;
ngraph::pass::Manager::Manager()
: m_to_set_is_output(true)
{
}
ngraph::pass::Manager::Manager(bool to_set_is_output)
: m_to_set_is_output(to_set_is_output)
{
}
......@@ -50,11 +56,14 @@ void ngraph::pass::Manager::run_passes(shared_ptr<Function> func)
set<shared_ptr<Function>> tfs(begin(fs), end(fs));
get_state().set_functions(tfs);
for (shared_ptr<Function> f : get_state().get_functions())
if (m_to_set_is_output)
{
for (size_t i = 0; i < f->get_output_size(); ++i)
for (shared_ptr<Function> f : get_state().get_functions())
{
f->get_output_op(i)->set_is_output();
for (size_t i = 0; i < f->get_output_size(); ++i)
{
f->get_output_op(i)->set_is_output();
}
}
}
......
......@@ -36,6 +36,7 @@ class ngraph::pass::Manager
{
public:
Manager();
Manager(bool to_set_is_output);
~Manager();
void initialize_default_passes();
......@@ -56,4 +57,5 @@ public:
private:
std::vector<std::shared_ptr<PassBase>> m_pass_list;
ManagerState m_state;
bool m_to_set_is_output;
};
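
The new boolean constructor lets a caller opt out of the automatic set_is_output() marking shown in the run_passes() change above. A two-line sketch follows; the sub-function scenario is an assumption, not stated by the commit.

ngraph::pass::Manager sub_manager(false);  // do not re-mark outputs
sub_manager.run_passes(subfunction);       // subfunction: std::shared_ptr<ngraph::Function>
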
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/placement.hpp"
std::string ngraph::placement_to_string(Placement placement)
{
switch (placement)
{
case Placement::DEFAULT: return "DEFAULT";
case Placement::INTERPRETER: return "INTERPRETER";
case Placement::CPU: return "CPU";
case Placement::GPU: return "GPU";
case Placement::ARGON: return "ARGON";
    }
    return "UNKNOWN";
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <string>
namespace ngraph
{
enum class Placement
{
DEFAULT,
INTERPRETER,
CPU,
GPU,
ARGON,
};
std::string placement_to_string(Placement placement);
}
......@@ -43,7 +43,7 @@
#include "ngraph/ops/batch_norm.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/ceiling.hpp"
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/concat.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convert.hpp"
#include "ngraph/ops/convolution.hpp"
......
......@@ -107,8 +107,9 @@ void runtime::cpu::CPUTensorView::read(void* target, size_t tensor_offset, size_
auto tvl = this->get_tensor_view_layout();
auto cpu_tvl = dynamic_cast<runtime::cpu::LayoutDescriptor*>(tvl.get());
if (cpu_tvl && cpu_tvl->get_mkldnn_format() != memory::format::format_undef &&
cpu_tvl->get_mkldnn_format() !=
runtime::cpu::mkldnn_utils::CreateNativeDataFormat(*cpu_tvl))
!runtime::cpu::mkldnn_utils::compare_mkldnn_formats(
cpu_tvl->get_mkldnn_format(),
runtime::cpu::mkldnn_utils::CreateNativeDataFormat(*cpu_tvl)))
{
auto tensor_shape = this->get_shape();
auto input_format = cpu_tvl->get_mkldnn_format();
......
......@@ -25,12 +25,15 @@
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/types/element_type.hpp"
#include "mkldnn_utils.hpp"
using namespace mkldnn;
using namespace ngraph;
using namespace std;
#define TI(x) std::type_index(typeid(x))
......@@ -94,6 +97,7 @@ static const std::map<memory::format, const std::string> s_mkldnn_format_string_
{memory::format::oIhw16i, "memory::format::oIhw16i"},
{memory::format::OIhw8i8o, "memory::format::OIhw8i8o"},
{memory::format::OIhw16i16o, "memory::format::OIhw16i16o"},
{memory::format::IOhw16o16i, "memory::format::IOhw16o16i"},
{memory::format::OIhw8o8i, "memory::format::OIhw8o8i"},
{memory::format::OIhw16o16i, "memory::format::OIhw16o16i"},
{memory::format::Oihw8o, "memory::format::Oihw8o"},
......@@ -148,3 +152,38 @@ const std::string& runtime::cpu::mkldnn_utils::get_mkldnn_format_string(memory::
std::to_string(fmt));
return it->second;
}
mkldnn::memory::format runtime::cpu::mkldnn_utils::get_input_mkldnn_format(const Node* node,
int index)
{
auto tvl = node->get_inputs()[index].get_output().get_tensor_view()->get_tensor_view_layout();
return dynamic_cast<runtime::cpu::LayoutDescriptor&>(*tvl).get_mkldnn_format();
}
mkldnn::memory::format runtime::cpu::mkldnn_utils::get_output_mkldnn_format(const Node* node,
int index)
{
auto tvl = node->get_output_tensor_view(0)->get_tensor_view_layout();
return dynamic_cast<runtime::cpu::LayoutDescriptor&>(*tvl).get_mkldnn_format();
}
bool runtime::cpu::mkldnn_utils::use_mkldnn_kernel(const ngraph::Node* node)
{
auto op_annotations = static_cast<const ngraph::op::Op*>(node)->get_op_annotations();
return (op_annotations &&
static_pointer_cast<ngraph::runtime::cpu::CPUOpAnnotations>(op_annotations)
->is_mkldnn_op());
}
bool runtime::cpu::mkldnn_utils::compare_mkldnn_formats(mkldnn::memory::format fmt1,
mkldnn::memory::format fmt2)
{
set<mkldnn::memory::format> similar_4d_formats{mkldnn::memory::format::nchw,
mkldnn::memory::format::oihw};
if ((fmt1 == fmt2) || (similar_4d_formats.find(fmt1) != similar_4d_formats.end() &&
similar_4d_formats.find(fmt2) != similar_4d_formats.end()))
{
return true;
}
return false;
}
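
Illustrative calls, not part of the commit: the helper treats nchw activations and oihw weights as equivalent 4-d layouts, while a genuinely different (blocked) layout still compares unequal.

bool a = ngraph::runtime::cpu::mkldnn_utils::compare_mkldnn_formats(
    mkldnn::memory::format::nchw, mkldnn::memory::format::oihw);   // true
bool b = ngraph::runtime::cpu::mkldnn_utils::compare_mkldnn_formats(
    mkldnn::memory::format::nchw, mkldnn::memory::format::nChw8c); // false
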
......@@ -38,6 +38,12 @@ namespace ngraph
const std::string& get_mkldnn_data_type_string(const ngraph::element::Type& type);
mkldnn::memory::data_type get_mkldnn_data_type(const ngraph::element::Type& type);
const std::string& get_mkldnn_format_string(mkldnn::memory::format fmt);
mkldnn::memory::format get_input_mkldnn_format(const Node* node, int index);
mkldnn::memory::format get_output_mkldnn_format(const Node* node, int index);
bool use_mkldnn_kernel(const ngraph::Node* node);
bool compare_mkldnn_formats(mkldnn::memory::format fmt1,
mkldnn::memory::format fmt2);
}
}
}
......
......@@ -25,7 +25,9 @@
#include <mkldnn.hpp>
#include "ngraph/descriptor/output.hpp"
#include "ngraph/ops/avg_pool.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
......@@ -66,6 +68,134 @@ namespace ngraph
convolution->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::ConvolutionBackpropData)
{
auto convolution = static_cast<op::ConvolutionBackpropData*>(node);
auto arg0_shape = node->get_input_shape(0);
auto arg1_shape = node->get_input_shape(1);
auto result_shape = node->get_output_shape(0);
auto arg0_rank = arg0_shape.size();
auto arg1_rank = arg1_shape.size();
bool data_dilated = false;
for (size_t s : convolution->get_data_dilation_strides_forward())
{
data_dilated = data_dilated || (s != 1);
}
if (!data_dilated && arg0_rank == 4 && arg1_rank == 4 &&
node->get_input_element_type(0) == element::f32)
{
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
convolution->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::ConvolutionBackpropFilters)
{
auto convolution = static_cast<op::ConvolutionBackpropFilters*>(node);
auto arg0_shape = node->get_input_shape(0);
auto arg1_shape = node->get_input_shape(1);
auto result_shape = node->get_output_shape(0);
auto arg0_rank = arg0_shape.size();
auto arg1_rank = arg1_shape.size();
bool data_dilated = false;
for (size_t s : convolution->get_data_dilation_strides_forward())
{
data_dilated = data_dilated || (s != 1);
}
if (!data_dilated && arg0_rank == 4 && arg1_rank == 4 &&
node->get_input_element_type(0) == element::f32)
{
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
convolution->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::AvgPool)
{
auto avg_pool = static_cast<op::AvgPool*>(node);
auto arg0_shape = node->get_input_shape(0);
auto arg0_rank = arg0_shape.size();
auto result_shape = node->get_output_shape(0);
if (arg0_rank == 4 && avg_pool->get_window_shape().size() == 2 &&
node->get_input_element_type(0) == element::f32)
{
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
avg_pool->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::AvgPoolBackprop)
{
auto avg_pool = static_cast<op::AvgPoolBackprop*>(node);
auto arg0_shape = node->get_input_shape(0);
auto arg0_rank = arg0_shape.size();
auto result_shape = node->get_output_shape(0);
if (arg0_rank == 4 && avg_pool->get_window_shape().size() == 2 &&
node->get_input_element_type(0) == element::f32)
{
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
avg_pool->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::Relu)
{
auto relu = static_cast<op::Relu*>(node);
auto arg0_shape = node->get_input_shape(0);
auto arg0_rank = arg0_shape.size();
auto result_shape = node->get_output_shape(0);
if (arg0_rank == 4 && node->get_input_element_type(0) == element::f32)
{
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
relu->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::ReluBackprop)
{
auto relu = static_cast<op::ReluBackprop*>(node);
auto arg0_shape = node->get_input_shape(0);
auto arg0_rank = arg0_shape.size();
auto result_shape = node->get_output_shape(0);
if (arg0_rank == 4 && node->get_input_element_type(0) == element::f32)
{
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
relu->set_op_annotations(op_annotations);
}
}
}
}
}
......@@ -76,6 +206,16 @@ namespace ngraph
static const runtime::cpu::pass::AssignOpMap s_dispatcher{
{TI(ngraph::op::Convolution),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Convolution>},
{TI(ngraph::op::ConvolutionBackpropData),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBackpropData>},
{TI(ngraph::op::ConvolutionBackpropFilters),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBackpropFilters>},
{TI(ngraph::op::AvgPool), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::AvgPool>},
{TI(ngraph::op::AvgPoolBackprop),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::AvgPoolBackprop>},
{TI(ngraph::op::Relu), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Relu>},
{TI(ngraph::op::ReluBackprop),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ReluBackprop>},
};
bool runtime::cpu::pass::CPUAssignment::run_on_call_graph(
......
......@@ -53,6 +53,13 @@ namespace ngraph
private:
std::shared_ptr<CPU_ExternalFunction> m_external_function;
static std::shared_ptr<Node> insert_input_conversions(
CPU_ExternalFunction* external_function,
std::shared_ptr<Node>& node,
const std::vector<mkldnn::memory::format>& required_formats);
static void set_output_layouts(
std::shared_ptr<Node>& node,
const std::vector<mkldnn::memory::format>& output_formats);
static void set_default_layouts(CPU_ExternalFunction* external_function,
std::shared_ptr<Node> node);
};
......
......@@ -29,7 +29,7 @@
#include "ngraph/node.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/concat.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/dot.hpp"
......
......@@ -45,7 +45,7 @@
#include "ngraph/ops/atan.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/ceiling.hpp"
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/concat.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convert.hpp"
#include "ngraph/ops/convolution.hpp"
......
......@@ -25,7 +25,7 @@
#include "ngraph/node.hpp"
#include "ngraph/ops/avg_pool.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/concat.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/dot.hpp"
......
......@@ -34,7 +34,7 @@
#include "ngraph/ops/asin.hpp"
#include "ngraph/ops/atan.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/concat.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convert.hpp"
#include "ngraph/ops/cos.hpp"
......
......@@ -25,7 +25,7 @@
#include "ngraph/ops/batch_norm.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/ceiling.hpp"
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/concat.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convert.hpp"
#include "ngraph/ops/convolution.hpp"
......@@ -372,7 +372,7 @@ static shared_ptr<ngraph::Function>
auto padding_below = node_js.at("padding_below").get<vector<size_t>>();
auto padding_above = node_js.at("padding_above").get<vector<size_t>>();
auto include_padding_in_avg_computation =
node_js.at("include_padding_in_avg_computation").get<bool>();
get_or_default<bool>(node_js, "include_padding_in_avg_computation", false);
node = make_shared<op::AvgPoolBackprop>(forward_arg_shape,
args[0],
window_shape,
......@@ -796,6 +796,11 @@ static shared_ptr<ngraph::Function>
throw runtime_error(ss.str());
}
node_map[node_name] = node;
// Typically, it could be unsafe to change the name of a node since it may break name
// uniqueness. However, it could sometimes be helpful to use the original name from
// the serialization for debugging.
// node->set_name(node_name);
}
std::vector<std::shared_ptr<Node>> result;
......
......@@ -28,4 +28,19 @@ namespace ngraph
std::string serialize(std::shared_ptr<ngraph::Function>, size_t indent = 0);
std::shared_ptr<ngraph::Function> deserialize(std::istream&);
std::shared_ptr<ngraph::Function> deserialize(const std::string&);
template <typename T>
T get_or_default(nlohmann::json& j, const std::string& key, const T& default_value)
{
T rc;
try
{
rc = j.at(key).get<T>();
}
catch (...)
{
rc = default_value;
}
return rc;
}
}
......@@ -100,8 +100,8 @@ int main(int argc, char** argv)
include_paths.push_back({CLANG_BUILTIN_HEADERS_PATH, {}, true});
include_paths.push_back({"/usr/include/x86_64-linux-gnu", {"asm", "sys", "bits", "gnu"}});
include_paths.push_back({"/usr/include", {"asm", "sys", "bits", "gnu"}});
include_paths.push_back({"/usr/include", {"linux", "asm-generic"}});
include_paths.push_back(
{"/usr/include", {"asm", "sys", "bits", "gnu", "linux", "asm-generic"}});
include_paths.push_back({cpp0, {"bits"}});
include_paths.push_back({"/usr/include/c++/4.8.2/x86_64-redhat-linux", {"bits"}});
include_paths.push_back({cpp1, {"bits", "ext", "debug", "backward"}});
......@@ -168,63 +168,41 @@ int main(int argc, char** argv)
if (update_needed)
{
size_t total_size = 0;
size_t total_count = 0;
const string prefix = "pReFiX";
ofstream out(output_path);
out << "#pragma clang diagnostic ignored \"-Weverything\"\n";
out << "#include <vector>\n";
out << "namespace ngraph\n";
out << "{\n";
out << " static const uint8_t header_resources[] =\n";
out << " const std::vector<std::string> builtin_search_paths =\n";
out << " {\n";
vector<pair<size_t, size_t>> offset_size_list;
size_t offset = 0;
size_t total_size = 0;
size_t total_count = 0;
for (const ResourceInfo& path : include_paths)
{
for (const string& header_file : path.files)
out << " \"" << path.search_path << "\",\n";
}
out << " };\n";
out << " const std::vector<std::pair<std::string, std::string>> builtin_headers =\n";
out << " {\n";
for (const ResourceInfo& path : include_paths)
{
for (const string& header_path : path.files)
{
string header_data = read_file_to_string(header_file);
string base_path = header_file.substr(path.search_path.size() + 1);
header_data = rewrite_header(header_data, base_path);
string header_data = read_file_to_string(header_path);
string relative_path = header_path.substr(path.search_path.size() + 1);
header_data = rewrite_header(header_data, relative_path);
// header_data = uncomment(header_data);
total_size += header_data.size();
total_count++;
// data layout is triplet of strings containing:
// 1) search path
// 2) header path within search path
// 3) header data
// all strings are null terminated and the length includes the null
// The + 1 below is to account for the null terminator
dump(out, path.search_path.c_str(), path.search_path.size() + 1);
offset_size_list.push_back({offset, path.search_path.size() + 1});
offset += path.search_path.size() + 1;
dump(out, header_file.c_str(), header_file.size() + 1);
offset_size_list.push_back({offset, header_file.size() + 1});
offset += header_file.size() + 1;
dump(out, header_data.c_str(), header_data.size() + 1);
offset_size_list.push_back({offset, header_data.size() + 1});
offset += header_data.size() + 1;
out << " {";
out << "\"" << header_path << "\",\nR\"" << prefix << "(" << header_data << ")"
<< prefix << "\"},\n";
}
}
out << " };\n";
out << " struct HeaderInfo\n";
out << " {\n";
out << " const char* search_path;\n";
out << " const char* header_path;\n";
out << " const char* header_data;\n";
out << " };\n";
out << " std::vector<HeaderInfo> header_info\n";
out << " {\n";
for (size_t i = 0; i < offset_size_list.size();)
{
out << " {(char*)(&header_resources[" << offset_size_list[i++].first;
out << "]), (char*)(&header_resources[" << offset_size_list[i++].first;
out << "]), (char*)(&header_resources[" << offset_size_list[i++].first << "])},\n";
}
out << " };\n";
out << "}\n";
cout.imbue(locale(""));
cout << "Total size " << total_size << " in " << total_count << " files\n";
......
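
For orientation, roughly what the regenerated resource file now looks like; the paths and header text below are made up, only the two container names and the raw-string prefix follow from the generator above.

#include <string>
#include <utility>
#include <vector>
namespace ngraph
{
    const std::vector<std::string> builtin_search_paths =
    {
        "/usr/include",
    };
    const std::vector<std::pair<std::string, std::string>> builtin_headers =
    {
        // pair of (absolute header path, header contents)
        {"/usr/include/stdio.h", R"pReFiX(/* full header text ... */)pReFiX"},
    };
}
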
......@@ -39,6 +39,7 @@ set (SRC
input_output_assign.cpp
main.cpp
op.cpp
graph_partition.cpp
pass_liveness.cpp
pass_manager.cpp
pass_memory_layout.cpp
......@@ -98,7 +99,7 @@ if(NGRAPH_ARGON_ENABLE)
set(BACKEND_NAMES ${BACKEND_NAMES} "ARGON")
set(SRC ${SRC} argon_fusion.cpp)
# enable additional Argon backend test
# enable additional Argon backend test
set(ARGON_ADDITIONAL_BACKEND_TESTS ${ARGON_TRANSFORMER_SOURCE_DIR}/test/test_argon_backend)
message(STATUS "ARGON_ADDITIONAL_BACKEND_TESTS path: ${ARGON_ADDITIONAL_BACKEND_TESTS}")
set(ADDITIONAL_ARGON_TEST
......@@ -107,7 +108,7 @@ if(NGRAPH_ARGON_ENABLE)
${ARGON_ADDITIONAL_BACKEND_TESTS}/test_broadcast.cpp
${ARGON_ADDITIONAL_BACKEND_TESTS}/test_conv.cpp
${ARGON_ADDITIONAL_BACKEND_TESTS}/test_binary_ew.cpp)
# ensure the ADDITIONAL_ARGON_TEST files depend on the argon transformer
add_custom_command(OUTPUT ${ADDITIONAL_ARGON_TEST} DEPENDS ext_argon_transformer COMMAND "")
set(SRC ${SRC} ${ADDITIONAL_ARGON_TEST})
......@@ -193,4 +194,3 @@ add_custom_target(check
style-check
unit-test-check
)
......@@ -497,6 +497,8 @@ TEST(${BACKEND_NAME}, backwards_broadcast1)
TEST(${BACKEND_NAME}, backwards_concat_vector)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -522,6 +524,8 @@ TEST(${BACKEND_NAME}, backwards_concat_vector)
TEST(${BACKEND_NAME}, backwards_concat_axis_0)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -547,6 +551,8 @@ TEST(${BACKEND_NAME}, backwards_concat_axis_0)
TEST(${BACKEND_NAME}, backwards_concat_axis_1)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -572,6 +578,8 @@ TEST(${BACKEND_NAME}, backwards_concat_axis_1)
TEST(${BACKEND_NAME}, backwards_ceiling)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -632,6 +640,8 @@ TEST(${BACKEND_NAME}, backwards_cos)
TEST(${BACKEND_NAME}, backwards_cosh)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -654,6 +664,8 @@ TEST(${BACKEND_NAME}, backwards_cosh)
TEST(${BACKEND_NAME}, backwards_divide)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -851,6 +863,8 @@ TEST(${BACKEND_NAME}, backwards_exp)
TEST(${BACKEND_NAME}, backwards_floor)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -1000,6 +1014,8 @@ TEST(${BACKEND_NAME}, backwards_parameter)
TEST(${BACKEND_NAME}, backwards_power)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -1068,6 +1084,8 @@ TEST(${BACKEND_NAME}, backwards_relu)
TEST(${BACKEND_NAME}, backwards_replace_slice)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -1113,6 +1131,8 @@ TEST(${BACKEND_NAME}, backwards_reshape)
TEST(${BACKEND_NAME}, backwards_select)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -1147,6 +1167,8 @@ TEST(${BACKEND_NAME}, backwards_select)
TEST(${BACKEND_NAME}, backwards_select_nested)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -1181,6 +1203,8 @@ TEST(${BACKEND_NAME}, backwards_select_nested)
TEST(${BACKEND_NAME}, backwards_sign)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -1235,6 +1259,8 @@ TEST(${BACKEND_NAME}, backwards_sin)
TEST(${BACKEND_NAME}, backwards_sinh)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -1257,9 +1283,10 @@ TEST(${BACKEND_NAME}, backwards_sinh)
TEST(${BACKEND_NAME}, backwards_slice)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
test::Uniform<float> rng(-10.0f, 10.0f);
Shape shape{5, 5};
auto make_graph = [shape]() {
......@@ -1394,6 +1421,8 @@ TEST(${BACKEND_NAME}, backwards_sum_m2v_1)
TEST(${BACKEND_NAME}, backwards_tan)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -1429,6 +1458,8 @@ TEST(${BACKEND_NAME}, backwards_tan)
TEST(${BACKEND_NAME}, backwards_tanh)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......
......@@ -24,7 +24,7 @@
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/concat.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/call_frame.hpp"
#include "ngraph/runtime/cpu/cpu_call_frame.hpp"
......
......@@ -16,7 +16,9 @@
#include "gtest/gtest.h"
#include "ngraph/file_util.hpp"
#include "ngraph/ngraph.hpp"
#include "ngraph/serializer.hpp"
#include "util/test_tools.hpp"
#include <memory>
......
......@@ -213,11 +213,11 @@ def emit_test(t,f):
TEST (${BACKEND_NAME}, %s)
{
Shape shape_a{%s};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_b{%s};
auto B = make_shared<op::Parameter>(element::f32, shape_b);
Shape shape_r{%s};
auto make_graph = [A, B] {
auto make_graph = [shape_a, shape_b] {
auto A = make_shared<op::Parameter>(element::f32, shape_a);
auto B = make_shared<op::Parameter>(element::f32, shape_b);
return make_shared<Function>(make_shared<op::Convolution>(A, B,
Strides{%s}, // move_strides
Strides{%s}, // filter_dilation
......
......@@ -106,6 +106,18 @@ TEST(serialize, existing_models)
}
}
TEST(serialize, default_value)
{
json j = {{"test1", 1}, {"test2", 2}};
int x1 = j.at("test1").get<int>();
EXPECT_EQ(x1, 1);
int x2 = get_or_default<int>(j, "test2", 0);
EXPECT_EQ(x2, 2);
int x3 = get_or_default<int>(j, "test3", 3);
EXPECT_EQ(x3, 3);
}
TEST(benchmark, serialize)
{
stopwatch timer;
......
......@@ -69,7 +69,9 @@ namespace ngraph
}
if (a->get_shape() != b->get_shape())
{
return false;
}
return all_close(read_vector<T>(a), read_vector<T>(b), rtol, atol);
}
......
......@@ -14,9 +14,13 @@
* limitations under the License.
*******************************************************************************/
#include "ngraph/log.hpp"
#include "ngraph/runtime/manager.hpp"
#include "ngraph/types/element_type.hpp"
#include "util/all_close.hpp"
#include "util/autodiff/backprop_derivative.hpp"
#include "util/autodiff/numeric_derivative.hpp"
#include "util/test_tools.hpp"
template <typename T>
bool autodiff_numeric_compare(const std::shared_ptr<ngraph::runtime::Manager>& manager,
......@@ -27,15 +31,48 @@ bool autodiff_numeric_compare(const std::shared_ptr<ngraph::runtime::Manager>& m
T atol)
{
T delta = static_cast<T>(0.001);
// Use INTERPRETER to compute numerical derivatives
auto interpreter_manager = ngraph::runtime::Manager::get("INTERPRETER");
auto interpreter_backend = interpreter_manager->allocate_backend();
auto f = make_graph();
std::vector<std::shared_ptr<ngraph::runtime::TensorView>> interpreter_args;
for (auto arg : args)
{
auto interpreter_arg = interpreter_backend->make_primary_tensor_view(
arg->get_tensor().get_element_type(), arg->get_shape());
// TODO: copy_data should not require T. Quick fix here for bool used in `Select`
if (arg->get_tensor().get_element_type() == ngraph::element::boolean)
{
copy_data(interpreter_arg, read_vector<char>(arg));
}
else
{
copy_data(interpreter_arg, read_vector<T>(arg));
}
interpreter_args.push_back(interpreter_arg);
}
auto results_num = ngraph::autodiff::numeric_derivative<T>(
manager, backend, f, args, delta, f->get_parameters());
interpreter_manager, interpreter_backend, f, interpreter_args, delta, f->get_parameters());
// Use the backend being tested to compute symbolic derivatives
auto g = make_graph();
auto results_sym =
ngraph::autodiff::backprop_derivative<T>(manager, backend, g, args, g->get_parameters());
return ngraph::test::all_close(results_num, results_sym, rtol, atol);
// Copy to host (INTERPRETER) tensor views for comparison
std::vector<std::shared_ptr<ngraph::runtime::TensorView>> interpreter_results_sym;
for (auto result : results_sym)
{
auto interpreter_result = interpreter_backend->make_primary_tensor_view(
ngraph::element::from<T>(), result->get_shape());
copy_data(interpreter_result, read_vector<T>(result));
interpreter_results_sym.push_back(interpreter_result);
}
return ngraph::test::all_close(results_num, interpreter_results_sym, rtol, atol);
}
template <typename T>
......@@ -48,6 +85,7 @@ bool autodiff_numeric_compare_selective(
T atol,
const std::vector<bool>& indep_param_mask)
{
// Use INTERPRETER to compute numerical derivatives
std::vector<std::shared_ptr<ngraph::op::Parameter>> f_indep_params;
auto f = make_graph();
......@@ -62,9 +100,30 @@ bool autodiff_numeric_compare_selective(
i++;
}
auto results_num =
ngraph::autodiff::numeric_derivative<T>(manager, backend, f, args, .001f, f_indep_params);
auto interpreter_manager = ngraph::runtime::Manager::get("INTERPRETER");
auto interpreter_backend = interpreter_manager->allocate_backend();
std::vector<std::shared_ptr<ngraph::runtime::TensorView>> interpreter_args;
for (auto arg : args)
{
auto interpreter_arg = interpreter_backend->make_primary_tensor_view(
arg->get_tensor().get_element_type(), arg->get_shape());
// TODO: copy_data should not require T. Quick fix here for bool used in `Select`
if (arg->get_tensor().get_element_type() == ngraph::element::boolean)
{
copy_data(interpreter_arg, read_vector<char>(arg));
}
else
{
copy_data(interpreter_arg, read_vector<T>(arg));
}
interpreter_args.push_back(interpreter_arg);
}
auto results_num = ngraph::autodiff::numeric_derivative<T>(
interpreter_manager, interpreter_backend, f, interpreter_args, .001f, f_indep_params);
// Use the backend being tested to compute symbolic derivatives
std::vector<std::shared_ptr<ngraph::op::Parameter>> g_indep_params;
auto g = make_graph();
......@@ -82,5 +141,15 @@ bool autodiff_numeric_compare_selective(
auto results_sym =
ngraph::autodiff::backprop_derivative<T>(manager, backend, g, args, g_indep_params);
return ngraph::test::all_close(results_num, results_sym, rtol, atol);
// Copy to host (INTERPRETER) tensor views for comparison
std::vector<std::shared_ptr<ngraph::runtime::TensorView>> interpreter_results_sym;
for (auto result : results_sym)
{
auto interpreter_result = interpreter_backend->make_primary_tensor_view(
ngraph::element::from<T>(), result->get_shape());
copy_data(interpreter_result, read_vector<T>(result));
interpreter_results_sym.push_back(interpreter_result);
}
return ngraph::test::all_close(results_num, interpreter_results_sym, rtol, atol);
}