Commit 48c52da7 authored by nikolay.korovaiko

Merge remote-tracking branch 'origin/master' into krovatkin/rs_concat

parents 02a6b07c a534274f
......@@ -105,6 +105,7 @@ option(NGRAPH_GPU_ENABLE "Control the building of the GPU backend" FALSE)
option(NGRAPH_INTERPRETER_ENABLE "Control the building of the INTERPRETER backend" TRUE)
option(NGRAPH_NOP_ENABLE "Control the building of the NOP backend" TRUE)
option(NGRAPH_GPUH_ENABLE "Control the building of the Hybrid GPU backend" FALSE)
option(NGRAPH_GENERIC_CPU_ENABLE "Control the building of the generic CPU backend" FALSE)
option(NGRAPH_DISTRIBUTED_ENABLE "Add distributed mode to the CPU backend" FALSE)
option(NGRAPH_DEBUG_ENABLE "Enable output for NGRAPH_DEBUG statements" FALSE)
option(NGRAPH_ONNX_IMPORT_ENABLE "Enable ONNX importer" FALSE)
......@@ -125,6 +126,7 @@ message(STATUS "NGRAPH_GPU_ENABLE: ${NGRAPH_GPU_ENABLE}")
message(STATUS "NGRAPH_INTERPRETER_ENABLE: ${NGRAPH_INTERPRETER_ENABLE}")
message(STATUS "NGRAPH_NOP_ENABLE: ${NGRAPH_NOP_ENABLE}")
message(STATUS "NGRAPH_GPUH_ENABLE: ${NGRAPH_GPUH_ENABLE}")
message(STATUS "NGRAPH_GENERIC_CPU_ENABLE: ${NGRAPH_GENERIC_CPU_ENABLE}")
message(STATUS "NGRAPH_DISTRIBUTED_ENABLE: ${NGRAPH_DISTRIBUTED_ENABLE}")
message(STATUS "NGRAPH_DEBUG_ENABLE: ${NGRAPH_DEBUG_ENABLE}")
message(STATUS "NGRAPH_ONNX_IMPORT_ENABLE: ${NGRAPH_ONNX_IMPORT_ENABLE}")
......
......@@ -6,11 +6,11 @@
/cmake/ @rkimballn1 @silee2
/.ci/ @aslepko @crlishka
/.ci/travis/ @postrational
/.ci/onnx/ @postrational
/contrib/docker/ @aslepko @crlishka
/.travis.yml @postrational
/.ci/ @aslepko
/.ci/travis/ @aslepko @postrational
/.ci/onnx/ @aslepko @postrational
/contrib/docker/ @aslepko
/.travis.yml @aslepko @postrational
/.clang-format @rkimballn1
/.gitattributes @rkimballn1
......
......@@ -22,7 +22,7 @@ include(ExternalProject)
#------------------------------------------------------------------------------
set(CLDNN_GIT_REPO_URL https://github.com/intel/clDNN.git)
set(CLDNN_GIT_LABEL 02add7c4ce2baa81e2a32fa02d733dcc4f013108)
set(CLDNN_GIT_LABEL f91d7d83d8f121e4e159776b108e316f2f08bdf5)
set(BOOST_VERSION 1.64.0)
set(OUT_DIR ${EXTERNAL_PROJECTS_ROOT}/cldnn/out)
......@@ -66,5 +66,5 @@ else()
ExternalProject_Get_Property(ext_cldnn SOURCE_DIR BINARY_DIR)
add_dependencies(libcldnn ext_cldnn)
target_include_directories(libcldnn SYSTEM INTERFACE ${SOURCE_DIR}/api)
target_link_libraries(libcldnn INTERFACE ${SOURCE_DIR}/build/out/Linux64/Release/libclDNN64.so)
target_link_libraries(libcldnn INTERFACE ${SOURCE_DIR}/build/out/Linux64/${CMAKE_BUILD_TYPE}/${CMAKE_SHARED_LIBRARY_PREFIX}clDNN64${CMAKE_SHARED_LIBRARY_SUFFIX})
endif()
......@@ -42,3 +42,7 @@ add_custom_command(
)
add_custom_target(python_wheel DEPENDS ngraph ${CMAKE_BINARY_DIR}/python/dist/)
if (NGRAPH_CPU_ENABLE)
add_dependencies(python_wheel ext_mkldnn)
endif()
......@@ -374,6 +374,10 @@ class BuildExt(build_ext):
build_ext.build_extensions(self)
if sys.platform == 'darwin':
# This turns out to be needed when building using Anaconda python on macOS.
os.environ['MACOSX_DEPLOYMENT_TARGET'] = '10.9'
with open(os.path.join(PYNGRAPH_ROOT_DIR, 'requirements.txt')) as req:
requirements = req.read().splitlines()
......
......@@ -147,6 +147,7 @@ set (SRC
pass/nop_elimination.cpp
pass/pass.cpp
pass/pass_config.cpp
pass/prefix_reshape_elimination.cpp
pass/propagate_cacheability.cpp
pass/reshape_elimination.cpp
pass/reshape_sinking.cpp
......
......@@ -99,6 +99,14 @@ namespace ngraph
return op_list;
}
bool is_operator_supported(const std::string& op_name,
std::int64_t version,
const std::string& domain)
{
return OperatorsBridge::is_operator_registered(
op_name, version, domain == "ai.onnx" ? "" : domain);
}
} // namespace onnx_import
} // namespace ngraph
......@@ -52,6 +52,18 @@ namespace ngraph
std::set<std::string> get_supported_operators(std::int64_t version,
const std::string& domain);
/// \brief Determines whether an ONNX operator is supported.
///
/// \param[in] op_name The ONNX operator name.
/// \param[in] version The ONNX operator set version.
/// \param[in] domain The domain the ONNX operator is registered to.
///
/// \return true if the operator is supported, false otherwise.
///
bool is_operator_supported(const std::string& op_name,
std::int64_t version,
const std::string& domain = "ai.onnx");
/// \brief Convert an ONNX model to an nGraph function.
/// The function translates a serialized ONNX model into an nGraph function. The serialized
/// ONNX model is read from an input stream.
......
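A minimal usage sketch of the query added above (the include path is assumed from this repository's layout and is not part of the patch):

#include <iostream>
#include "ngraph/frontend/onnx_import/onnx.hpp" // assumed public header path

int main()
{
    // Domain defaults to "ai.onnx"; the version falls back to the nearest
    // registered opset at or below the requested one.
    const bool ok = ngraph::onnx_import::is_operator_supported("Add", 7);
    std::cout << (ok ? "supported" : "unsupported") << std::endl;
    return 0;
}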
......@@ -21,6 +21,7 @@
#include <unordered_map>
#include "core/attribute.hpp"
#include "ngraph/log.hpp"
#include "op/abs.hpp"
#include "op/acos.hpp"
#include "op/add.hpp"
......@@ -102,20 +103,19 @@ namespace ngraph
{
namespace detail
{
const Operator& find(const std::string& name,
std::int64_t version,
const std::string& domain,
const std::map<std::int64_t, Operator>& map)
const std::map<std::int64_t, Operator>::const_iterator
find(std::int64_t version, const std::map<std::int64_t, Operator>& map)
{
std::map<std::int64_t, Operator>::const_iterator it{};
while (version > 0)
{
const auto it = map.find(version--);
it = map.find(version--);
if (it != std::end(map))
{
return it->second;
return it;
}
}
throw error::UnsupportedVersion{name, version, domain};
return it;
}
}
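The reworked lookup above resolves a requested opset version to the nearest registered version at or below it, and signals a miss by returning the end iterator. A standalone approximation of that behaviour, using plain standard-library types rather than the bridge's real ones:

#include <cassert>
#include <cstdint>
#include <map>

int main()
{
    // Hypothetical registrations at opset versions 1 and 7.
    std::map<std::int64_t, int> registered{{1, 100}, {7, 700}};
    auto find_fallback = [&](std::int64_t version) {
        auto it = registered.end();
        while (version > 0)
        {
            it = registered.find(version--);
            if (it != registered.end())
            {
                break;
            }
        }
        return it;
    };
    assert(find_fallback(9)->second == 700);          // falls back from 9 to 7
    assert(find_fallback(5)->second == 100);          // falls back from 5 to 1
    assert(find_fallback(0) == registered.end());     // nothing registered at or below 0
    return 0;
}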
......@@ -136,13 +136,51 @@ namespace ngraph
{
throw error::UnknownDomain{domain};
}
if (version > OperatorsBridge::LATEST_SUPPORTED_OPSET_VERSION)
{
NGRAPH_WARN << "Currently operator set version: " << version << " is unsupported."
<< " Falling back to: "
<< OperatorsBridge::LATEST_SUPPORTED_OPSET_VERSION;
}
for (const auto& op : dm->second)
{
result.emplace(op.first, detail::find(op.first, version, domain, op.second));
const auto& it = detail::find(version, op.second);
if (it == std::end(op.second))
{
throw error::UnsupportedVersion{op.first, version, domain};
}
result.emplace(op.first, it->second);
}
return result;
}
bool OperatorsBridge::_is_operator_registered(const std::string& name,
std::int64_t version,
const std::string& domain)
{
// search for domain
auto dm_map = m_map.find(domain);
if (dm_map == std::end(m_map))
{
return false;
}
// search for name
auto op_map = dm_map->second.find(name);
if (op_map == std::end(dm_map->second))
{
return false;
}
if (detail::find(version, op_map->second) != std::end(op_map->second))
{
return true;
}
else
{
return false;
}
}
#define REGISTER_OPERATOR(name_, ver_, fn_) \
m_map[""][name_].emplace(ver_, std::bind(op::set_##ver_::fn_, std::placeholders::_1))
......
......@@ -62,6 +62,8 @@ namespace ngraph
class OperatorsBridge
{
public:
static constexpr const int LATEST_SUPPORTED_OPSET_VERSION = ONNX_OPSET_VERSION;
OperatorsBridge(const OperatorsBridge&) = delete;
OperatorsBridge& operator=(const OperatorsBridge&) = delete;
OperatorsBridge(OperatorsBridge&&) = delete;
......@@ -80,6 +82,13 @@ namespace ngraph
instance()._register_operator(name, version, domain, std::move(fn));
}
static bool is_operator_registered(const std::string& name,
std::int64_t version,
const std::string& domain)
{
return instance()._is_operator_registered(name, version, domain);
}
private:
std::unordered_map<std::string,
std::unordered_map<std::string, std::map<std::int64_t, Operator>>>
......@@ -98,6 +107,9 @@ namespace ngraph
const std::string& domain,
Operator fn);
OperatorSet _get_operator_set(std::int64_t version, const std::string& domain);
bool _is_operator_registered(const std::string& name,
std::int64_t version,
const std::string& domain);
};
} // namespace onnx_import
......
......@@ -30,7 +30,7 @@ using namespace ngraph;
void ngraph::default_logger_handler_func(const string& s)
{
cout << s << endl;
cout << s + "\n";
}
LogHelper::LogHelper(LOG_TYPE type,
......
......@@ -14,7 +14,7 @@
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/plaidml/plaidml_pass_reshape_elision.hpp"
#include "ngraph/pass/prefix_reshape_elimination.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/util/binary_elementwise_arithmetic.hpp"
......@@ -24,7 +24,7 @@
#include "ngraph/pattern/op/any_of.hpp"
#include "ngraph/pattern/op/label.hpp"
ngraph::runtime::plaidml::pass::ReshapeElision::ReshapeElision()
ngraph::pass::PrefixReshapeElimination::PrefixReshapeElimination()
{
auto src_op = std::make_shared<pattern::op::Label>(
element::i8, Shape{}, [](std::shared_ptr<Node>) { return true; });
......@@ -39,13 +39,10 @@ ngraph::runtime::plaidml::pass::ReshapeElision::ReshapeElision()
}
// Validate that this isn't a reordering-reshape.
for (std::size_t idx = 0; idx < reshape->get_input_order().size(); ++idx)
{
if (idx != reshape->get_input_order().at(idx))
if (reshape->get_is_transpose())
{
return false;
}
}
// Make sure that logical dimension sizes match.
const Shape& src_shape = reshape->get_input_shape(0);
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/pass/graph_rewrite.hpp"
namespace ngraph
{
namespace pass
{
class PrefixReshapeElimination;
}
}
// A pass to eliminate reshapes whose output shapes are the same as
// their input shape modulo leading size-1 axes.
//
// N.B. This pass MUST only be used by backends that can handle the
// omission of leading size-1 axes, e.g. backends that implement
// NumPy-style broadcast semantics.
class ngraph::pass::PrefixReshapeElimination final : public ngraph::pass::GraphRewrite
{
public:
PrefixReshapeElimination();
};
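For illustration (hypothetical shapes, not taken from the patch): a Reshape producing Shape{1, 1, 3, 4} from an input of Shape{3, 4} only prepends size-1 axes and is a candidate for elimination, while a Reshape whose input order permutes axes (get_is_transpose()) is left untouched. A registration sketch, assuming the usual nGraph pass-manager flow; the helper name is made up, and in this patch the pass is actually registered by the PlaidML backend during compilation:

#include <memory>
#include "ngraph/function.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/prefix_reshape_elimination.hpp"

// Hypothetical helper shown only to illustrate how the pass plugs in.
void eliminate_prefix_reshapes(const std::shared_ptr<ngraph::Function>& f)
{
    ngraph::pass::Manager pass_manager;
    pass_manager.register_pass<ngraph::pass::PrefixReshapeElimination>();
    pass_manager.run_passes(f);
}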
......@@ -37,6 +37,10 @@ if (NGRAPH_GPUH_ENABLE)
add_subdirectory(gpuh)
endif()
if (NGRAPH_GENERIC_CPU_ENABLE)
add_subdirectory(generic_cpu)
endif()
if (NGRAPH_PLAIDML_ENABLE)
add_subdirectory(plaidml)
endif()
......@@ -145,3 +145,8 @@ vector<runtime::PerformanceCounter>
}
return rc;
}
bool runtime::cpu::CPU_Backend::is_supported(const Node& op) const
{
return true;
}
......@@ -58,6 +58,8 @@ namespace ngraph
std::vector<PerformanceCounter>
get_performance_data(std::shared_ptr<Function> func) const override;
bool is_supported(const Node& node) const override;
private:
class FunctionInstance
{
......
......@@ -1982,10 +1982,10 @@ void runtime::cpu::CPU_ExternalFunction::build()
file_util::path_join(s_debug_dir, m_function_name + "_debug.txt");
std::stringstream ss;
ss << "EXECUTION PLAN:\n";
ss << "\nEXECUTION PLAN:\n";
for (size_t i = 0; i < functors.size(); i++)
{
ss << op_names.at(i) << "will be executed with the following inputs:\n";
ss << op_names.at(i) << " will be executed with the following inputs:\n";
for (auto is : this->m_op_attrs.at(i).Inputs)
{
ss << "\t" << is << " = " << this->get_tensor_data(is) << std::endl;
......
# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
if (NGRAPH_GENERIC_CPU_ENABLE)
find_package(OpenMP)
if (OPENMP_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
endif()
add_library(gcpu_backend SHARED gcpu_backend.cpp node_wrapper.cpp)
if(NGRAPH_LIB_VERSIONING_ENABLE)
set_target_properties(gcpu_backend PROPERTIES
VERSION ${NGRAPH_VERSION}
SOVERSION ${NGRAPH_API_VERSION})
endif()
target_link_libraries(gcpu_backend PRIVATE ngraph libeigen hybrid_base interpreter_backend)
target_compile_options(gcpu_backend PUBLIC -fopenmp)
set_target_properties(gcpu_backend PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${NGRAPH_BUILD_DIR})
install(TARGETS gcpu_backend
LIBRARY DESTINATION "${NGRAPH_INSTALL_LIB}"
ARCHIVE DESTINATION "${NGRAPH_INSTALL_LIB}"
)
endif()
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <Eigen/Dense>
#include <cmath>
#include <omp.h>
#include <utility>
#include "ngraph/runtime/reference/broadcast.hpp"
#include "ngraph/shape_util.hpp"
#include "ngraph/util.hpp"
namespace ngraph
{
namespace runtime
{
namespace gcpu
{
namespace kernel
{
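// Splits [0, size) into one contiguous chunk per OpenMP thread; outside a
// parallel region there is a single thread, so the whole range is returned.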
std::tuple<size_t, size_t> get_start_finish(size_t size)
{
const size_t nthreads = omp_get_num_threads();
const size_t ithread = omp_get_thread_num();
const size_t start = ithread * size / nthreads;
const size_t finish = (ithread + 1) * size / nthreads;
return std::make_tuple(start, finish);
}
template <typename T>
void broadcast_2d(const T* in,
T* out,
const Shape& in_shape,
const Shape& out_shape,
const AxisSet& broadcast_axes)
{
size_t index[2];
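// in_shape has rank 1 here, so exactly one output axis is not a broadcast
// axis; out_index aliases that axis's loop counter and serves as the index
// into the 1-D input.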
size_t* out_index =
(broadcast_axes.find(0) == broadcast_axes.end() ? &index[0] : &index[1]);
for (index[0] = 0; index[0] < out_shape[0]; ++index[0])
{
for (index[1] = 0; index[1] < out_shape[1]; ++index[1])
{
out[index[0] * out_shape[1] + index[1]] = in[*out_index];
}
}
}
// #define PARALLEL
template <typename T>
void broadcast_3d(const T* in,
T* out,
const Shape& in_shape,
const Shape& out_shape,
const AxisSet& broadcast_axes)
{
#ifdef PARALLEL
#pragma omp parallel
#endif
{
size_t start;
size_t finish;
#ifdef PARALLEL
std::tie(start, finish) = get_start_finish(out_shape[0]);
#else
start = 0;
finish = out_shape[0];
#endif
size_t index[3];
size_t* out_index = 0;
for (size_t i = 0; i < 3; i++)
{
if (broadcast_axes.count(i) == 0)
{
out_index = &index[i];
break;
}
}
for (index[0] = start; index[0] < finish; ++index[0])
{
for (index[1] = 0; index[1] < out_shape[1]; ++index[1])
{
for (index[2] = 0; index[2] < out_shape[2]; ++index[2])
{
out[index[0] * out_shape[1] * out_shape[2] +
index[1] * out_shape[2] + index[2]] = in[*out_index];
}
}
}
}
}
template <typename T>
void broadcast_4d(const T* in,
T* out,
const Shape& in_shape,
const Shape& out_shape,
const AxisSet& broadcast_axes)
{
size_t index[4];
size_t* out_index = 0;
for (size_t i = 0; i < 4; i++)
{
if (broadcast_axes.count(i) == 0)
{
out_index = &index[i];
break;
}
}
for (index[0] = 0; index[0] < out_shape[0]; ++index[0])
{
for (index[1] = 0; index[1] < out_shape[1]; ++index[1])
{
for (index[2] = 0; index[2] < out_shape[2]; ++index[2])
{
for (index[3] = 0; index[3] < out_shape[3]; ++index[3])
{
out[index[0] * out_shape[1] * out_shape[2] * out_shape[3] +
index[1] * out_shape[2] * out_shape[3] +
index[2] * out_shape[3] + index[3]] = in[*out_index];
}
}
}
}
}
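// Dispatch: a rank-0 input is splatted to every output element, a rank-1
// input broadcast to a rank-2/3/4 output uses the specialized kernels above,
// and inputs of rank 2 or higher fall back to the generic reference
// implementation.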
template <typename T>
void broadcast(const T* in,
T* out,
const Shape& in_shape,
const Shape& out_shape,
const AxisSet& broadcast_axes)
{
if (in_shape.size() == 0)
{
for (size_t i = 0; i < shape_size(out_shape); ++i)
{
out[i] = in[0];
}
}
else if (in_shape.size() == 1)
{
switch (out_shape.size())
{
case 2:
broadcast_2d<T>(in, out, in_shape, out_shape, broadcast_axes);
break;
case 3:
broadcast_3d<T>(in, out, in_shape, out_shape, broadcast_axes);
break;
case 4:
broadcast_4d<T>(in, out, in_shape, out_shape, broadcast_axes);
break;
}
}
else
{
runtime::reference::broadcast<T>(
in, out, in_shape, out_shape, broadcast_axes);
}
}
}
}
}
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <Eigen/Dense>
#include <cmath>
#include <omp.h>
#include <utility>
#include "ngraph/coordinate_transform.hpp"
#include "ngraph/shape_util.hpp"
namespace ngraph
{
namespace runtime
{
namespace gcpu
{
namespace kernel
{
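// The 2-D x 2-D case (a plain matrix product) takes the Eigen fast path;
// every other rank combination uses the generic coordinate-transform loop.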
template <typename T>
void dot(const T* arg0,
const T* arg1,
T* out,
const Shape& arg0_shape,
const Shape& arg1_shape,
const Shape& out_shape,
size_t reduction_axes_count)
{
if (arg0_shape.size() == 2 && arg1_shape.size() == 2 && out_shape.size() == 2)
{
Eigen::Map<
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
a0(const_cast<T*>(arg0), arg0_shape[0], arg0_shape[1]);
Eigen::Map<
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
a1(const_cast<T*>(arg1), arg1_shape[0], arg1_shape[1]);
Eigen::Map<
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
o(const_cast<T*>(out), out_shape[0], out_shape[1]);
o = a0 * a1;
}
else
{
// Get the sizes of the dot axes. It's easiest to pull them from arg1 because they're
// right up front.
Shape dot_axis_sizes(reduction_axes_count);
std::copy(arg1_shape.begin(),
arg1_shape.begin() + reduction_axes_count,
dot_axis_sizes.begin());
CoordinateTransform arg0_transform(arg0_shape);
CoordinateTransform arg1_transform(arg1_shape);
CoordinateTransform output_transform(out_shape);
// Create coordinate transforms for arg0 and arg1 that throw away the dotted axes.
size_t arg0_projected_rank = arg0_shape.size() - reduction_axes_count;
size_t arg1_projected_rank = arg1_shape.size() - reduction_axes_count;
Shape arg0_projected_shape(arg0_projected_rank);
std::copy(arg0_shape.begin(),
arg0_shape.begin() + arg0_projected_rank,
arg0_projected_shape.begin());
Shape arg1_projected_shape(arg1_projected_rank);
std::copy(arg1_shape.begin() + reduction_axes_count,
arg1_shape.end(),
arg1_projected_shape.begin());
CoordinateTransform arg0_projected_transform(arg0_projected_shape);
CoordinateTransform arg1_projected_transform(arg1_projected_shape);
// Create a coordinate transform that allows us to iterate over all possible values
// for the dotted axes.
CoordinateTransform dot_axes_transform(dot_axis_sizes);
for (const Coordinate& arg0_projected_coord : arg0_projected_transform)
{
for (const Coordinate& arg1_projected_coord : arg1_projected_transform)
{
// The output coordinate is just the concatenation of the projected coordinates.
Coordinate out_coord(arg0_projected_coord.size() +
arg1_projected_coord.size());
auto out_coord_it = std::copy(arg0_projected_coord.begin(),
arg0_projected_coord.end(),
out_coord.begin());
std::copy(arg1_projected_coord.begin(),
arg1_projected_coord.end(),
out_coord_it);
// Zero out to start the sum.
T sum = 0;
size_t out_index = output_transform.index(out_coord);
// Walk along the dotted axes.
Coordinate arg0_coord(arg0_shape.size());
Coordinate arg1_coord(arg1_shape.size());
auto arg0_it = std::copy(arg0_projected_coord.begin(),
arg0_projected_coord.end(),
arg0_coord.begin());
for (const Coordinate& dot_axis_positions : dot_axes_transform)
{
// In order to find the points to multiply together, we need to inject our current
// positions along the dotted axes back into the projected arg0 and arg1 coordinates.
std::copy(dot_axis_positions.begin(),
dot_axis_positions.end(),
arg0_it);
auto arg1_it = std::copy(dot_axis_positions.begin(),
dot_axis_positions.end(),
arg1_coord.begin());
std::copy(arg1_projected_coord.begin(),
arg1_projected_coord.end(),
arg1_it);
// Multiply and add to the sum.
sum += arg0[arg0_transform.index(arg0_coord)] *
arg1[arg1_transform.index(arg1_coord)];
}
// Write the sum back.
out[out_index] = sum;
}
}
}
}
}
}
}
}
......@@ -16,27 +16,26 @@
#pragma once
#include "ngraph/pass/graph_rewrite.hpp"
#include <algorithm>
#include <cmath>
#include <numeric>
#include <vector>
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace plaidml
namespace gcpu
{
namespace pass
namespace kernel
{
class ReshapeElision;
template <typename T>
void result(const T* arg, T* out, size_t count)
{
memcpy(out, arg, sizeof(T) * count);
}
}
}
}
}
// A minor pass to elide unnecessary reshapes. A reshape is
// considered unnecessary if its output shape is the same as its input
// shape, modulo leading size-1 axes.
class ngraph::runtime::plaidml::pass::ReshapeElision final : public ngraph::pass::GraphRewrite
{
public:
ReshapeElision();
};
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/generic_cpu/node_wrapper.hpp"
using namespace ngraph;
using namespace std;
runtime::gcpu::NodeWrapper::NodeWrapper(const shared_ptr<const Node>& node)
: m_node{node}
{
// This expands the op list in op_tbl.hpp into a list of enumerations that look like this:
// {"Abs", runtime::gcpu::OP_TYPEID::Abs},
// {"Acos", runtime::gcpu::OP_TYPEID::Acos},
// ...
#define NGRAPH_OP(a, b) {#a, runtime::gcpu::OP_TYPEID::a},
static unordered_map<string, runtime::gcpu::OP_TYPEID> typeid_map{
#include "ngraph/op/op_tbl.hpp"
};
#undef NGRAPH_OP
auto it = typeid_map.find(m_node->description());
if (it != typeid_map.end())
{
m_typeid = it->second;
}
else
{
throw unsupported_op("Unsupported op '" + m_node->description() + "'");
}
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <memory>
#include "ngraph/node.hpp"
namespace ngraph
{
namespace runtime
{
namespace gcpu
{
enum class OP_TYPEID;
class NodeWrapper;
}
}
}
// This expands the op list in op_tbl.hpp into a list of enumerations that look like this:
// Abs,
// Acos,
// ...
#define NGRAPH_OP(a, b) a,
enum class ngraph::runtime::gcpu::OP_TYPEID
{
#include "ngraph/op/op_tbl.hpp"
};
#undef NGRAPH_OP
/// \brief This class allows adding an enum typeid to each Node. This makes dealing with
/// collections of Nodes a little easier and faster as we can use switch() instead of
/// if/else statements
class ngraph::runtime::gcpu::NodeWrapper
{
public:
NodeWrapper(const std::shared_ptr<const ngraph::Node>& node);
const Node& get_node() const { return *m_node; }
ngraph::runtime::gcpu::OP_TYPEID get_typeid() const { return m_typeid; }
private:
std::shared_ptr<const ngraph::Node> m_node;
OP_TYPEID m_typeid;
};
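A typical consumer switches on the wrapped type id; a sketch with one illustrative case (real backends cover every entry of op_tbl.hpp, and the dispatcher name here is made up):

#include "ngraph/runtime/generic_cpu/node_wrapper.hpp"

// Hypothetical dispatcher shown only to illustrate the enum-based switch.
void dispatch(const ngraph::runtime::gcpu::NodeWrapper& wrapper)
{
    switch (wrapper.get_typeid())
    {
    case ngraph::runtime::gcpu::OP_TYPEID::Abs:
        // handle Abs here
        break;
    default:
        // everything else is left to the real backend
        break;
    }
}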
......@@ -34,6 +34,7 @@
#include "ngraph/node.hpp"
#include "ngraph/pass/algebraic_simplification.hpp"
#include "ngraph/pass/any_all_replacement.hpp"
#include "ngraph/pass/get_output_element_elimination.hpp"
#include "ngraph/pass/like_replacement.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
......@@ -173,6 +174,7 @@ void runtime::gpu::GPUCompiledFunction::compile()
pass_manager.register_pass<ngraph::pass::LikeReplacement>();
pass_manager.register_pass<runtime::gpu::pass::GPULayout>(this);
pass_manager.register_pass<ngraph::pass::AssignLayout<descriptor::layout::DenseTensorLayout>>();
pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>();
pass_manager.register_pass<ngraph::pass::Liveness>();
pass_manager.register_pass<ngraph::pass::MemoryLayout>(get_memory_alignment());
pass_manager.register_pass<runtime::gpu::pass::TensorMemoryReservation>(
......
......@@ -18,13 +18,10 @@
#include "ngraph/graph_util.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/visualize_tree.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_tensor.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/hybrid/hybrid_util.hpp"
#include "ngraph/runtime/hybrid/pass/assign_placement.hpp"
#include "ngraph/runtime/hybrid/pass/fix_get_output_element.hpp"
#include "ngraph/runtime/interpreter/int_backend.hpp"
#include "ngraph/runtime/tensor.hpp"
using namespace ngraph;
......@@ -205,32 +202,6 @@ bool runtime::hybrid::HybridBackend::is_supported(const Node& node) const
return true;
}
string runtime::hybrid::HybridBackend::get_placement_name(const runtime::Tensor* t)
{
string rc;
if (dynamic_cast<const runtime::HostTensor*>(t) != nullptr)
{
rc = "HostTensor";
}
else if (dynamic_cast<const runtime::gpu::GPUTensor*>(t) != nullptr)
{
rc = "GPUTensor";
}
return rc;
}
string runtime::hybrid::HybridBackend::get_placement_name(const runtime::Backend* t)
{
string rc;
if (dynamic_cast<const runtime::interpreter::INTBackend*>(t) != nullptr)
{
rc = "INTBackend";
}
else if (dynamic_cast<const runtime::gpu::GPU_Backend*>(t) != nullptr)
{
rc = "GPU_Backend";
}
return rc;
}
size_t runtime::hybrid::HybridBackend::get_placement(const runtime::Tensor* t)
{
size_t index = 0;
......
......@@ -70,7 +70,5 @@ private:
std::map<std::shared_ptr<ngraph::Function>, FunctionInstance> m_function_map;
std::vector<std::shared_ptr<runtime::Backend>> m_backend_list;
std::string get_placement_name(const runtime::Tensor* t);
std::string get_placement_name(const runtime::Backend* t);
size_t get_placement(const runtime::Tensor* t);
};
......@@ -194,6 +194,8 @@ static void do_eltwise_operation(cldnn::topology& topology,
{
arguments_check(op, 2, 1);
// Leave it here for some time
#if USE_INTELGPU_CUSTOM_KERNELS
if ((get_input_type(op) == element::i32 || get_input_type(op) == element::i64) &&
(mode == cldnn::eltwise_mode::min || mode == cldnn::eltwise_mode::max))
{
......@@ -229,6 +231,12 @@ static void do_eltwise_operation(cldnn::topology& topology,
get_output_name(op), {get_input_name(op, 0), get_input_name(op, 1)}, mode);
topology.add(op_add);
}
#else
const cldnn::eltwise op_eltwise(
get_output_name(op), {get_input_name(op, 0), get_input_name(op, 1)}, mode);
topology.add(op_eltwise);
#endif
}
static void do_unary_operation(cldnn::topology& topology,
......@@ -382,7 +390,14 @@ runtime::intelgpu::IntelGPUBackend::IntelGPUBackend()
m_function_cache_disabled = true;
}
cldnn::engine_configuration cldnn_configuration(profiling);
cldnn::engine_configuration cldnn_configuration(profiling,
false,
m_cldnn_dump_enable,
string(),
string(),
true,
string(),
m_cldnn_dump_dir);
ocl_engine = make_shared<cldnn::engine>(cldnn_configuration);
}
......@@ -411,6 +426,14 @@ runtime::Handle runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function>
set<cldnn::primitive_id> func_output_names;
cldnn::topology topology;
stopwatch timer_compile;
double mem_before_compile = 0.0;
if (m_profile_enable)
{
mem_before_compile = get_max_memory_rss();
timer_compile.start();
}
if (m_dump_graph_enable)
{
......@@ -1800,6 +1823,13 @@ runtime::Handle runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function>
instance.ocl_network =
make_shared<cldnn::network>(*ocl_engine, topology, network_build_options);
if (m_profile_enable)
{
timer_compile.stop();
instance.m_compilation_time = timer_compile.get_milliseconds();
instance.m_consumed_memory = get_max_memory_rss() - mem_before_compile;
}
return func;
}
......@@ -1807,17 +1837,8 @@ bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func,
const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs)
{
double mem_before_call = 0.0f;
double mem_after_compilation = 0.0f;
double mem_after_call = 0.0f;
double mem_call_consumed = 0.0f;
stopwatch timer_call;
stopwatch timer_compile;
if (m_profile_enable)
{
mem_before_call = get_max_memory_rss();
timer_compile.start();
}
FunctionInstance& instance = ocl_networks[func];
if (instance.ocl_network == nullptr)
......@@ -1827,8 +1848,7 @@ bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func,
if (m_profile_enable)
{
timer_compile.stop();
mem_after_compilation = get_max_memory_rss();
mem_call_consumed = get_max_memory_rss();
timer_call.start();
}
......@@ -1876,15 +1896,18 @@ bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func,
if (m_profile_enable)
{
timer_call.stop();
mem_after_call = get_max_memory_rss();
mem_call_consumed = get_max_memory_rss() - mem_call_consumed;
print_call_performance(network,
func,
timer_compile.get_milliseconds(),
instance.m_compilation_time,
timer_call.get_milliseconds(),
mem_before_call,
mem_after_compilation,
mem_after_call);
instance.m_consumed_memory,
mem_call_consumed,
get_max_memory_rss());
// Output compile time only once
instance.m_compilation_time = 0.0;
}
if (m_function_cache_disabled)
......@@ -1987,11 +2010,11 @@ static Node* get_node_by_name(const shared_ptr<Function> func, const string& nam
void runtime::intelgpu::IntelGPUBackend::print_call_performance(
const shared_ptr<cldnn::network> network,
const shared_ptr<Function> func,
size_t time_compile,
size_t time_call,
double mem_before_call,
double mem_after_compilation,
double mem_after_call) const
double time_compile,
double time_call,
double mem_compilation_consumed,
double mem_call_consumed,
double mem_current) const
{
struct data_item
{
......@@ -2102,10 +2125,10 @@ void runtime::intelgpu::IntelGPUBackend::print_call_performance(
}
// Print time and memory consumed in ::call function
cout << func_name << delim << " Backend compilation(ms)" << delim << time_compile << " call(ms)"
<< delim << time_call << delim << "memory before call(B)" << delim << mem_before_call
<< delim << "after compilation(B)" << delim << mem_after_compilation << delim
<< "after call(B)" << delim << mem_after_call << endl;
cout << func_name << delim << " Backend compilation(ms)" << delim << time_compile << delim
<< "call(ms)" << delim << time_call << delim << "memory consumption compile(B)" << delim
<< mem_compilation_consumed << delim << "call(B)" << delim << mem_call_consumed << delim
<< "RSS(B)" << delim << mem_current << endl;
cout.flags(saved_stream_flags); // Restore stream configuration to leave it in original state
}
......@@ -64,6 +64,8 @@ private:
public:
std::shared_ptr<cldnn::network> ocl_network = nullptr;
bool m_performance_counters_enabled = false;
double m_compilation_time = 0.0;
double m_consumed_memory = 0.0;
};
std::map<std::shared_ptr<Function>, FunctionInstance> ocl_networks;
......@@ -74,11 +76,11 @@ private:
// Statistic related things
void print_call_performance(const std::shared_ptr<cldnn::network> network,
const std::shared_ptr<Function> func,
size_t time_compile,
size_t time_call,
double mem_before_call,
double mem_after_compilation,
double mem_after_call) const;
double time_compile,
double time_call,
double mem_compilation_consumed,
double mem_call_consumed,
double mem_current) const;
bool m_profile_enable = false;
long m_profile_lines_limit_count = 10;
......
......@@ -20,6 +20,7 @@ batch_norm_inference_f64
batch_norm_training_0eps_f64
batch_norm_one_output
batch_norm_three_outputs
batch_norm_bprop_n4c3h2w2
dequantize
dequantize_axes
dequantize_int32
......
......@@ -54,7 +54,6 @@ set(SRC
plaidml_pass_lower_convolutions.cpp
plaidml_pass_replicate_combination.cpp
plaidml_pass_replicate_elision.cpp
plaidml_pass_reshape_elision.cpp
plaidml_pass_winograd.cpp
plaidml_tensor.cpp
plaidml_translate.cpp
......
......@@ -31,14 +31,15 @@ ngraph::runtime::plaidml::PlaidML_Backend::PlaidML_Backend(const char* configura
std::shared_ptr<ngraph::runtime::Tensor> ngraph::runtime::plaidml::PlaidML_Backend::create_tensor(
const ngraph::element::Type& element_type, const ngraph::Shape& shape)
{
return std::make_shared<PlaidML_Tensor>(&m_config, element_type, shape, "direct_data", nullptr);
return std::make_shared<PlaidML_Tensor>(
this, &m_config, element_type, shape, "direct_data", nullptr);
}
std::shared_ptr<ngraph::runtime::Tensor> ngraph::runtime::plaidml::PlaidML_Backend::create_tensor(
const ngraph::element::Type& element_type, const Shape& shape, void* memory_pointer)
{
return std::make_shared<PlaidML_Tensor>(
&m_config, element_type, shape, "direct_data", memory_pointer);
this, &m_config, element_type, shape, "direct_data", memory_pointer);
}
std::shared_ptr<ngraph::Function>
......
......@@ -26,6 +26,7 @@
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/nop_elimination.hpp"
#include "ngraph/pass/prefix_reshape_elimination.hpp"
#include "ngraph/pass/visualize_tree.hpp"
#include "ngraph/pass/zero_dim_tensor_elimination.hpp"
#include "ngraph/runtime/plaidml/plaidml_impl.hpp"
......@@ -36,7 +37,6 @@
#include "ngraph/runtime/plaidml/plaidml_pass_lower_convolutions.hpp"
#include "ngraph/runtime/plaidml/plaidml_pass_replicate_combination.hpp"
#include "ngraph/runtime/plaidml/plaidml_pass_replicate_elision.hpp"
#include "ngraph/runtime/plaidml/plaidml_pass_reshape_elision.hpp"
#include "ngraph/runtime/plaidml/plaidml_pass_winograd.hpp"
namespace
......@@ -101,9 +101,9 @@ std::shared_ptr<ngraph::runtime::plaidml::CompiledFunction>
pass_manager.register_pass<ngraph::runtime::plaidml::pass::ReplicateElision>();
pass_manager.register_pass<ngraph::runtime::plaidml::pass::ReplicateCombination>();
pass_manager.register_pass<ngraph::runtime::plaidml::pass::ImplicitBroadcast>();
pass_manager.register_pass<ngraph::runtime::plaidml::pass::ReshapeElision>();
pass_manager.register_pass<ngraph::pass::PrefixReshapeElimination>();
pass_manager.register_pass<ngraph::runtime::plaidml::pass::LowerConvolutions>();
if (m_config->winograd)
if (pass_manager.get_pass_config().get_pass_enable("Winograd"))
{
pass_manager.register_pass<ngraph::runtime::plaidml::pass::Winograd>();
}
......
......@@ -77,7 +77,6 @@ ngraph::runtime::plaidml::Config
bool help = false;
bool list = false;
bool debug = false;
bool winograd = false;
std::size_t device_idx = 0;
std::string eventlog_config;
std::string graphviz;
......@@ -242,14 +241,6 @@ ngraph::runtime::plaidml::Config
continue;
}
// Check for Winograd. (Winograd is sometimes a performance
// boost, but not always, so we make it optional.)
if (is_opt("winograd"))
{
winograd = true;
continue;
}
// Reject unknown options
err = true;
}
......@@ -257,7 +248,7 @@ ngraph::runtime::plaidml::Config
constexpr char help_text[] =
"PlaidML Backend Specification: \""
"PlaidML[:[device_index][,debug][,help][,list_devices][,"
"eventlog=<filename>][,graphviz=<filename>][,winograd]]\". For example: \"PlaidML\", \""
"eventlog=<filename>][,graphviz=<filename>]]\". For example: \"PlaidML\", \""
"PlaidML:0,list_devices\"";
if (err)
{
......@@ -292,7 +283,5 @@ ngraph::runtime::plaidml::Config
result.graphviz = graphviz;
result.winograd = winograd;
return result;
}
......@@ -39,6 +39,5 @@ struct ngraph::runtime::plaidml::Config
std::shared_ptr<vertexai::ctx> ctx;
std::shared_ptr<vertexai::plaidml::device> dev;
bool debug;
bool winograd;
std::string graphviz;
};
......@@ -166,7 +166,7 @@ namespace ngraph
{
Impl impl;
impl.set_build(build);
impl.set_op(dynamic_cast<const typename Impl::Op*>(op));
impl.set_op(static_cast<const typename Impl::Op*>(op));
impl.Apply();
}
};
......
......@@ -33,7 +33,7 @@ ngraph::runtime::plaidml::pass::ConcatElision::ConcatElision()
});
pattern::graph_rewrite_callback callback = [](pattern::Matcher& m) {
auto concat = std::dynamic_pointer_cast<ngraph::op::Concat>(m.get_match_root());
auto concat = std::static_pointer_cast<ngraph::op::Concat>(m.get_match_root());
auto args = concat->get_arguments();
// Elide one-argument concats.
......
......@@ -45,7 +45,7 @@ ngraph::runtime::plaidml::pass::ImplicitBroadcast::ImplicitBroadcast()
// for the broadcast axes.
auto src = m.get_matched_nodes().at(2);
Shape src_shape = src->get_shape();
auto broadcast = std::dynamic_pointer_cast<op::Broadcast>(m.get_matched_nodes().at(1));
auto broadcast = std::static_pointer_cast<op::Broadcast>(m.get_matched_nodes().at(1));
AxisVector reshape_order;
Shape reshape_shape;
......
......@@ -53,9 +53,7 @@ ngraph::runtime::plaidml::pass::LowerConvolutions::LowerConvolutions()
{
return reshape->get_input_order();
}
AxisVector result(node->get_shape().size());
std::iota(result.begin(), result.end(), 0);
return result;
return get_default_order(node->get_shape());
};
std::shared_ptr<Node> node = m.get_match_root();
......
......@@ -37,8 +37,8 @@ ngraph::runtime::plaidml::pass::ReplicateCombination::ReplicateCombination()
pattern::graph_rewrite_callback callback = [](pattern::Matcher& m) {
auto nodes = m.get_matched_nodes();
auto lower = std::dynamic_pointer_cast<plaidml::op::Replicate>(nodes.at(0));
auto upper = std::dynamic_pointer_cast<plaidml::op::Replicate>(nodes.at(1));
auto lower = std::static_pointer_cast<plaidml::op::Replicate>(nodes.at(0));
auto upper = std::static_pointer_cast<plaidml::op::Replicate>(nodes.at(1));
std::vector<size_t> axes = lower->get_replication_axes();
const std::vector<size_t>& upper_axes = upper->get_replication_axes();
auto uit = upper_axes.begin();
......
......@@ -51,7 +51,7 @@ ngraph::runtime::plaidml::pass::ReplicateElision::ReplicateElision()
for (auto nit = nodes.begin() + 1; nit != nodes.end(); ++nit)
{
auto replicate = std::dynamic_pointer_cast<plaidml::op::Replicate>(*nit);
auto replicate = std::static_pointer_cast<plaidml::op::Replicate>(*nit);
const auto& replicate_axes = replicate->get_replication_axes();
bool elidable = true;
for (std::size_t idx = 0; idx < dim_limit; ++idx)
......
......@@ -112,7 +112,7 @@ ngraph::runtime::plaidml::pass::Winograd::Winograd()
});
pattern::graph_rewrite_callback callback = [](pattern::Matcher& m) {
auto conv = std::dynamic_pointer_cast<plaidml::op::Convolution>(m.get_match_root());
auto conv = std::static_pointer_cast<plaidml::op::Convolution>(m.get_match_root());
NodeVector args = conv->get_arguments();
std::shared_ptr<ngraph::op::Constant> a;
std::shared_ptr<ngraph::op::Constant> b;
......
......@@ -22,12 +22,13 @@
namespace vp = vertexai::plaidml;
ngraph::runtime::plaidml::PlaidML_Tensor::PlaidML_Tensor(Config* config,
ngraph::runtime::plaidml::PlaidML_Tensor::PlaidML_Tensor(Backend* parent,
Config* config,
const ngraph::element::Type& element_type,
const ngraph::Shape& shape,
const std::string& name,
void* memory)
: Tensor{std::make_shared<ngraph::descriptor::Tensor>(element_type, shape, name)}
: Tensor{std::make_shared<ngraph::descriptor::Tensor>(element_type, shape, name), parent}
, m_tensor{config->dev->allocate(
to_plaidml(config->ctx, element_type, shape, ConversionUse::FOR_IO))}
, m_memory{memory}
......
......@@ -35,7 +35,8 @@ namespace ngraph
class ngraph::runtime::plaidml::PlaidML_Tensor final : public ngraph::runtime::Tensor
{
public:
PlaidML_Tensor(Config* config,
PlaidML_Tensor(Backend* parent,
Config* config,
const ngraph::element::Type& element_type,
const ngraph::Shape& shape,
const std::string& name,
......
......@@ -45,6 +45,9 @@ endif()
if (NGRAPH_PLAIDML_ENABLE)
target_link_libraries(nbench plaidml_backend)
endif()
if (NGRAPH_GENERIC_CPU_ENABLE)
target_link_libraries(nbench gcpu_backend)
endif()
if (NGRAPH_DISTRIBUTED_ENABLE)
target_compile_definitions(nbench PRIVATE NGRAPH_DISTRIBUTED)
......
......@@ -15,7 +15,9 @@
//*****************************************************************************
#include <random>
#if defined(__x86_64__) || defined(__amd64__)
#include <xmmintrin.h>
#endif
#include "benchmark.hpp"
#include "ngraph/file_util.hpp"
......@@ -107,53 +109,20 @@ void init_real_tv(shared_ptr<runtime::Tensor> tv, T min, T max)
static void random_init(shared_ptr<runtime::Tensor> tv)
{
element::Type et = tv->get_element_type();
if (et == element::boolean)
{
init_int_tv<char>(tv, 0, 1);
}
else if (et == element::f32)
{
init_real_tv<float>(tv, -1, 1);
}
else if (et == element::f64)
{
init_real_tv<double>(tv, -1, 1);
}
else if (et == element::i8)
{
init_int_tv<int8_t>(tv, -1, 1);
}
else if (et == element::i16)
{
init_int_tv<int16_t>(tv, -1, 1);
}
else if (et == element::i32)
{
init_int_tv<int32_t>(tv, 0, 1);
}
else if (et == element::i64)
{
init_int_tv<int64_t>(tv, -1, 1);
}
else if (et == element::u8)
{
init_int_tv<uint8_t>(tv, 0, 1);
}
else if (et == element::u16)
{
init_int_tv<uint16_t>(tv, 0, 1);
}
else if (et == element::u32)
{
init_int_tv<uint32_t>(tv, 0, 1);
}
else if (et == element::u64)
{
init_int_tv<uint64_t>(tv, 0, 1);
}
else
{
throw runtime_error("unsupported type");
switch (et.get_type_enum())
{
case element::Type_t::boolean: init_int_tv<char>(tv, 0, 1); break;
case element::Type_t::f32: init_real_tv<float>(tv, -1, 1); break;
case element::Type_t::f64: init_real_tv<double>(tv, -1, 1); break;
case element::Type_t::i8: init_int_tv<int8_t>(tv, -1, 1); break;
case element::Type_t::i16: init_int_tv<int16_t>(tv, -1, 1); break;
case element::Type_t::i32: init_int_tv<int32_t>(tv, 0, 1); break;
case element::Type_t::i64: init_int_tv<int64_t>(tv, -1, 1); break;
case element::Type_t::u8: init_int_tv<uint8_t>(tv, 0, 1); break;
case element::Type_t::u16: init_int_tv<uint16_t>(tv, 0, 1); break;
case element::Type_t::u32: init_int_tv<uint32_t>(tv, 0, 1); break;
case element::Type_t::u64: init_int_tv<uint64_t>(tv, 0, 1); break;
default: throw runtime_error("unsupported type");
}
}
......
......@@ -106,6 +106,10 @@ if (NGRAPH_PLAIDML_ENABLE)
set(ACTIVE_BACKEND_LIST ${ACTIVE_BACKEND_LIST} PlaidML)
endif()
if (NGRAPH_GENERIC_CPU_ENABLE)
set(ACTIVE_BACKEND_LIST ${ACTIVE_BACKEND_LIST} GCPU)
endif()
add_subdirectory(models)
add_subdirectory(files)
add_subdirectory(util)
......
......@@ -1660,7 +1660,7 @@ TEST(onnx, model_argmax_int32)
std::vector<std::int64_t>{1, 1, 1, 1, 1, 1}};
std::vector<std::vector<std::int64_t>> outputs{
execute<std::int32_t, std::int64_t>(function, inputs, "CPU")};
execute<std::int32_t, std::int64_t>(function, inputs, "INTERPRETER")};
EXPECT_TRUE(test::all_close(expected_output.front(), outputs.front()));
}
......@@ -1675,6 +1675,37 @@ TEST(onnx, model_argmin_int32)
std::vector<std::vector<std::int64_t>> expected_output{std::vector<std::int64_t>{0, 0, 0, 0}};
std::vector<std::vector<std::int64_t>> outputs{
execute<std::int32_t, std::int64_t>(function, inputs, "CPU")};
execute<std::int32_t, std::int64_t>(function, inputs, "INTERPRETER")};
EXPECT_TRUE(test::all_close(expected_output.front(), outputs.front()));
}
TEST(onnx, model_is_op_supported)
{
// Simple case
EXPECT_TRUE(onnx_import::is_operator_supported("Sum", 1, "ai.onnx"));
// With fallback
EXPECT_TRUE(onnx_import::is_operator_supported("Sum", 100, "ai.onnx"));
// Different opset versions
EXPECT_TRUE(onnx_import::is_operator_supported("Add", 1, "ai.onnx"));
EXPECT_TRUE(onnx_import::is_operator_supported("Add", 7, "ai.onnx"));
// Default domain name
EXPECT_TRUE(onnx_import::is_operator_supported("Sum", 1));
// Unregistered operator
EXPECT_FALSE(onnx_import::is_operator_supported("DummyOp", 1));
EXPECT_FALSE(onnx_import::is_operator_supported("DummyOp", 1, "ai.onnx"));
EXPECT_FALSE(onnx_import::is_operator_supported("DummyOp", 10, "ai.onnx"));
// Operator with bad domain name
EXPECT_FALSE(onnx_import::is_operator_supported("Sum", 1, "bad.domain"));
// Registered custom operator
onnx_import::register_operator(
"AddQ", 1, "com.intel.ai", [](const onnx_import::Node& node) -> NodeVector {
NodeVector ng_inputs{node.get_ng_inputs()};
return {std::make_shared<ngraph::op::Add>(ng_inputs.at(0), ng_inputs.at(1))};
});
EXPECT_TRUE(onnx_import::is_operator_supported("AddQ", 1, "com.intel.ai"));
}