Commit 90a1f581 authored by Robert Kimball, committed by Scott Cyphers

remove non-plaid nvidia GPU backend (#3491)

parent 11d61848
......@@ -170,10 +170,8 @@ option(NGRAPH_TOOLS_ENABLE "Control the building of tool" TRUE)
option(NGRAPH_CPU_ENABLE "Control the building of the CPU backend" TRUE)
option(NGRAPH_MLIR_ENABLE "Control the building of MLIR backend" FALSE)
option(NGRAPH_INTELGPU_ENABLE "Control the building of the Intel GPU backend with clDNN" FALSE)
option(NGRAPH_GPU_ENABLE "Control the building of the GPU backend" FALSE)
option(NGRAPH_INTERPRETER_ENABLE "Control the building of the INTERPRETER backend" TRUE)
option(NGRAPH_NOP_ENABLE "Control the building of the NOP backend" TRUE)
option(NGRAPH_GPUH_ENABLE "Control the building of the Hybrid GPU backend" FALSE)
option(NGRAPH_GENERIC_CPU_ENABLE "Enable build nGraph for generic CPU backend" FALSE)
option(NGRAPH_DEBUG_ENABLE "Enable output for NGRAPH_DEBUG statements" FALSE)
option(NGRAPH_DEPRECATED_ENABLE "Enable compiler deprecation pragmas for deprecated APIs (recommended only for development use)" FALSE)
......@@ -194,8 +192,7 @@ option(NGRAPH_DYNAMIC_COMPONENTS_ENABLE "Enable dynamic loading of components" T
if (NGRAPH_CPU_ENABLE
AND
((NOT NGRAPH_GPU_ENABLE) AND (NOT NGRAPH_GPUH_ENABLE)
AND (NOT NGRAPH_GENERIC_CPU_ENABLE) AND (NOT NGRAPH_INTELGPU_ENABLE))
((NOT NGRAPH_GENERIC_CPU_ENABLE) AND (NOT NGRAPH_INTELGPU_ENABLE))
)
set(NGRAPH_INTEL_CPU_ONLY_ENABLE ON)
endif()
......@@ -224,10 +221,6 @@ if (NGRAPH_DISTRIBUTED_ENABLE)
endif()
endif()
if (NGRAPH_GPUH_ENABLE)
set(NGRAPH_GPU_ENABLE TRUE)
endif()
if (NGRAPH_ONNX_IMPORT_ENABLE)
option(NGRAPH_USE_SYSTEM_PROTOBUF "Use system provided Protobuf shared object" FALSE)
option(NGRAPH_ONNXIFI_ENABLE "Enable ONNX Interface for Framework Integration" TRUE)
......@@ -250,10 +243,8 @@ NORMALIZE_BOOL(NGRAPH_TOOLS_ENABLE)
NORMALIZE_BOOL(NGRAPH_CPU_ENABLE)
NORMALIZE_BOOL(NGRAPH_MLIR_ENABLE)
NORMALIZE_BOOL(NGRAPH_INTELGPU_ENABLE)
NORMALIZE_BOOL(NGRAPH_GPU_ENABLE)
NORMALIZE_BOOL(NGRAPH_INTERPRETER_ENABLE)
NORMALIZE_BOOL(NGRAPH_NOP_ENABLE)
NORMALIZE_BOOL(NGRAPH_GPUH_ENABLE)
NORMALIZE_BOOL(NGRAPH_GENERIC_CPU_ENABLE)
NORMALIZE_BOOL(NGRAPH_DEBUG_ENABLE)
NORMALIZE_BOOL(NGRAPH_DEPRECATED_ENABLE)
......@@ -277,10 +268,8 @@ message(STATUS "NGRAPH_TOOLS_ENABLE: ${NGRAPH_TOOLS_ENABLE}")
message(STATUS "NGRAPH_CPU_ENABLE: ${NGRAPH_CPU_ENABLE}")
message(STATUS "NGRAPH_MLIR_ENABLE: ${NGRAPH_MLIR_ENABLE}")
message(STATUS "NGRAPH_INTELGPU_ENABLE: ${NGRAPH_INTELGPU_ENABLE}")
message(STATUS "NGRAPH_GPU_ENABLE: ${NGRAPH_GPU_ENABLE}")
message(STATUS "NGRAPH_INTERPRETER_ENABLE: ${NGRAPH_INTERPRETER_ENABLE}")
message(STATUS "NGRAPH_NOP_ENABLE: ${NGRAPH_NOP_ENABLE}")
message(STATUS "NGRAPH_GPUH_ENABLE: ${NGRAPH_GPUH_ENABLE}")
message(STATUS "NGRAPH_GENERIC_CPU_ENABLE: ${NGRAPH_GENERIC_CPU_ENABLE}")
message(STATUS "NGRAPH_DEBUG_ENABLE: ${NGRAPH_DEBUG_ENABLE}")
message(STATUS "NGRAPH_DEPRECATED_ENABLE: ${NGRAPH_DEPRECATED_ENABLE}")
......@@ -554,7 +543,7 @@ if(NGRAPH_INTELGPU_ENABLE)
include(cmake/external_cldnn.cmake)
endif()
if ((NGRAPH_GPU_ENABLE OR NGRAPH_CPU_ENABLE) AND NOT NGRAPH_DEX_ONLY)
if (NGRAPH_CPU_ENABLE AND NOT NGRAPH_DEX_ONLY)
set(NGRAPH_CODEGEN_ENABLE TRUE)
else()
set(NGRAPH_CODEGEN_ENABLE FALSE)
......
......@@ -83,7 +83,6 @@ endif
# Configuration for GPU backend in Dockerfiles with "_gpu" suffix
# The nvidia-docker command must be used for any targets that actually utilize GPU devices
ifneq ("$(shell echo ${NVIDIA_SMI} | grep nvidia-smi)","")
CMAKE_OPTIONS_EXTRA+=-DNGRAPH_GPU_ENABLE=TRUE
DOCKERFILE=${CPU_DOCKERFILE}_gpu
DOCKER_CMD=nvidia-docker
DOCKER_CMD_MESG=GPU appears to be supported on this platform. Building for GPU and CPU backend support.
......
......@@ -50,13 +50,6 @@ endif()
list(APPEND HEADER_SEARCH_DEFINES CLANG_BUILTIN_HEADERS_PATH="${CLANG_INCLUDE_DIR}")
list(APPEND HEADER_SEARCH_DEFINES NGRAPH_HEADERS_PATH="${NGRAPH_INCLUDE_PATH}")
if(NGRAPH_GPU_ENABLE)
find_package(CUDA 8 REQUIRED)
find_package(CUDNN 7 REQUIRED)
list(APPEND HEADER_SEARCH_DEFINES CUDA_HEADER_PATHS="${CUDA_INCLUDE_DIRS}")
list(APPEND HEADER_SEARCH_DEFINES CUDNN_HEADER_PATHS="${CUDNN_INCLUDE_DIRS}")
endif()
if(NGRAPH_TBB_ENABLE)
get_target_property(TBB_INCLUDE_DIR libtbb INTERFACE_INCLUDE_DIRECTORIES)
list(APPEND HEADER_SEARCH_DEFINES TBB_HEADERS_PATH="${TBB_INCLUDE_DIR}")
......
......@@ -27,18 +27,10 @@ if (NGRAPH_INTELGPU_ENABLE)
add_subdirectory(intelgpu)
endif()
if (NGRAPH_GPU_ENABLE)
add_subdirectory(gpu)
endif()
if (NGRAPH_NOP_ENABLE)
add_subdirectory(nop)
endif()
if (NGRAPH_GPUH_ENABLE)
add_subdirectory(gpuh)
endif()
if (NGRAPH_GENERIC_CPU_ENABLE)
add_subdirectory(generic_cpu)
endif()
......
# ******************************************************************************
# Copyright 2017-2019 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIRS})
# Add sources for the GPU backend
# and all its dependencies
set(SRC
cuda_emitter.cpp
cudnn_emitter.cpp
cublas_emitter.cpp
host_emitter.cpp
gpu_backend.cpp
gpu_call_frame.cpp
gpu_cuda_context_manager.cpp
gpu_cuda_function_builder.cpp
gpu_cuda_function_pool.cpp
gpu_cuda_kernel_builder.cpp
gpu_emitter.cpp
gpu_compiled_function.cpp
gpu_internal_function.cpp
gpu_invoke.cpp
gpu_kernel_args.cpp
gpu_kernel_emitters.cpp
gpu_memory_manager.cpp
gpu_primitive_emitter.cpp
gpu_runtime_constructor.cpp
gpu_runtime_context.cpp
gpu_tensor_wrapper.cpp
gpu_tensor.cpp
gpu_util.cpp
type_info.cpp
pass/gpu_batch_norm_cache.cpp
pass/gpu_layout.cpp
pass/gpu_rnn_fusion.cpp
pass/tensor_memory_reservation.cpp
op/batch_norm.cpp
op/rnn.cpp
)
if (NOT NGRAPH_DEX_ONLY)
list(APPEND SRC gpu_external_function.cpp)
endif()
set(CUDA_INC
${PROJECT_SOURCE_DIR}/src/
)
set(CUDA_SRC
nvcc/example.cu.cpp
)
if (NGRAPH_GPU_ENABLE)
find_package(CUDA 9 QUIET)
if (CUDA_FOUND)
set(CUDA9_FOUND TRUE)
message(STATUS "Found CUDA 9")
else()
find_package(CUDA 8 REQUIRED)
endif()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};
--compiler-options -fPIC;
-arch=sm_30;
-gencode=arch=compute_35,code=sm_35;
-gencode=arch=compute_50,code=sm_50;
-gencode=arch=compute_52,code=sm_52;
-gencode=arch=compute_60,code=sm_60;
-gencode=arch=compute_61,code=sm_61;
-gencode=arch=compute_61,code=compute_61)
if (CUDA9_FOUND)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};
-gencode=arch=compute_62,code=sm_62;
-gencode=arch=compute_70,code=sm_70;
-gencode=arch=compute_70,code=compute_70)
endif()
set (DO_CUDA_COMPILE FALSE)
if (CUDA9_FOUND)
if (("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU"))
# CUDA 9 supports up to gcc 6.x
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0)
set (DO_CUDA_COMPILE TRUE)
else()
message(STATUS "NVCC will not be used because CUDA 9 only supports up to gcc 6.x")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
# CUDA 9 supports up to clang 3.9
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0)
set (DO_CUDA_COMPILE TRUE)
else()
message(STATUS "NVCC will not be used because CUDA 9 only supports up to clang 3.9")
endif()
endif()
else()
# CUDA 8 (minimum version of CUDA we support)
if (("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU"))
# Current release of CUDA 8 supports up to gcc 5.4
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.5)
set (DO_CUDA_COMPILE TRUE)
else()
message(STATUS "NVCC will not be used because CUDA 8 only supports up to gcc 5.4")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
# CUDA 8 supports up to clang 3.8
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.9)
set (DO_CUDA_COMPILE TRUE)
else()
message(STATUS "NVCC will not be used because CUDA 8 only supports up to clang 3.8")
endif()
endif()
endif()
if (DO_CUDA_COMPILE)
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
# CUDA_PROPAGATE_HOST_FLAGS is true by default, so disable
# clang warnings that are known to flag CUDA code
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};
--compiler-options -Wno-reserved-id-macro;
--compiler-options -Wno-undef;
--compiler-options -Wno-old-style-cast;
--compiler-options -Wno-deprecated;
--compiler-options -Wno-unused-macros;
--compiler-options -Wno-used-but-marked-unused)
endif()
message(STATUS "Precompiling static CUDA kernels via NVCC")
set_source_files_properties( ${CUDA_SRC} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
cuda_include_directories(${CUDA_INC})
cuda_compile(CUDA_OBJ ${CUDA_SRC} STATIC)
else()
message(STATUS "Not precompiling static CUDA kernels via NVCC; runtime compilation via NVRTC will be used.")
endif()
add_library(gpu_backend SHARED ${SRC} ${CUDA_OBJ})
target_compile_definitions(gpu_backend PRIVATE GPU_BACKEND_EXPORTS)
if(NGRAPH_LIB_VERSIONING_ENABLE)
set_target_properties(gpu_backend PROPERTIES
VERSION ${NGRAPH_VERSION}
SOVERSION ${NGRAPH_API_VERSION})
endif()
target_link_libraries(gpu_backend PUBLIC ngraph)
if (NGRAPH_DEX_ONLY)
target_compile_definitions(gpu_backend PRIVATE "NGRAPH_DEX_ONLY")
else()
target_link_libraries(gpu_backend PUBLIC codegen)
endif()
find_library(CUDA_nvrtc_LIBRARY nvrtc
PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64)
find_library(CUDA_cuda_LIBRARY cuda
PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64 cuda/lib64/stubs)
find_library(CUDA_cudart_LIBRARY ${CMAKE_STATIC_LIBRARY_PREFIX}cudart_static${CMAKE_STATIC_LIBRARY_SUFFIX}
PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64)
find_package(CUDNN 7 REQUIRED)
target_include_directories(gpu_backend SYSTEM PUBLIC ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIR})
target_link_libraries(gpu_backend
PUBLIC
${CUDA_cuda_LIBRARY}
${CUDA_nvrtc_LIBRARY}
${CUDA_cudart_LIBRARY}
${CUDA_LIBRARIES}
${CUDA_CUBLAS_LIBRARIES}
${CUDNN_LIBRARIES})
install(TARGETS gpu_backend
ARCHIVE DESTINATION ${NGRAPH_INSTALL_LIB}
LIBRARY DESTINATION ${NGRAPH_INSTALL_LIB})
endif()
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/gpu/cublas_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/util.hpp"
using namespace ngraph;
runtime::gpu::CUBLASEmitter::CUBLASEmitter(GPUPrimitiveEmitter* emitter, GPURuntimeContext* ctx)
: m_primitive_emitter(emitter)
{
m_ctx = ctx;
}
size_t runtime::gpu::CUBLASEmitter::build_dot(const element::Type& dtype,
const Shape& arg0_shape,
const Shape& arg1_shape,
const Shape& out_shape,
size_t reduction_axes,
const Node* node)
{
std::stringstream ss;
ss << "dot_op"
<< "_dtype_" << dtype.c_type_string() << "_reduction_axes_count_" << reduction_axes;
std::string hash = ss.str() + "_i_" + join(arg0_shape, "_") + "_i_" + join(arg1_shape, "_");
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
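// Note: primitives are memoized here; the shape/type signature is hashed, looked up
// in the primitive emitter, and only compiled into a new gpu::primitive on a miss,
// so repeated Dot nodes with identical signatures share a single cuBLAS closure.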
std::unique_ptr<gpu::primitive> dot;
if (arg0_shape.empty() || arg1_shape.empty())
{
auto& second = (arg0_shape.empty() ? arg1_shape : arg0_shape);
size_t count = shape_size(second);
size_t firstIndex = (arg0_shape.empty() ? 0 : 1);
size_t secondIndex = (arg0_shape.empty() ? 1 : 0);
dot.reset(new gpu::primitive{[=](void** inputs, void** outputs) {
CUBLAS_SAFE_CALL(cublasScopy(*m_ctx->cublas_handle,
count,
static_cast<const float*>(inputs[secondIndex]),
1,
static_cast<float*>(outputs[0]),
1));
CUBLAS_SAFE_CALL(cublasSscal(*m_ctx->cublas_handle,
count,
static_cast<const float*>(inputs[firstIndex]),
static_cast<float*>(outputs[0]),
1));
debug_sync();
}});
primitive_index = this->m_primitive_emitter->register_primitive(dot, hash);
}
// case that can be treated as dot1d
else if ((arg0_shape.size() == arg1_shape.size()) && (arg0_shape.size() == reduction_axes))
{
for (int i = 0; i < arg0_shape.size(); i++)
{
if (arg0_shape[i] != arg1_shape[i])
{
std::vector<std::string> arg_vec{"arg0", "arg1"};
std::vector<Shape> shape_vec{arg0_shape, arg1_shape};
throw std::invalid_argument(get_error_string(arg_vec, shape_vec, node));
}
}
size_t count = shape_size(arg0_shape);
dot.reset(new gpu::primitive{[=](void** inputs, void** outputs) {
CUBLAS_SAFE_CALL(cublasSdot(*m_ctx->cublas_handle,
count,
static_cast<const float*>(inputs[0]),
1,
static_cast<const float*>(inputs[1]),
1,
static_cast<float*>(outputs[0])));
debug_sync();
}});
primitive_index = this->m_primitive_emitter->register_primitive(dot, hash);
}
// matrix vector
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1))
{
dot.reset(new gpu::primitive{[=](void** inputs, void** outputs) {
const float alpha = 1.0;
const float beta = 0;
CUBLAS_SAFE_CALL(cublasSetPointerMode(*m_ctx->cublas_handle, CUBLAS_POINTER_MODE_HOST));
CUBLAS_SAFE_CALL(cublasSgemv(*m_ctx->cublas_handle,
CUBLAS_OP_T,
arg0_shape[1],
arg0_shape[0],
&alpha,
static_cast<const float*>(inputs[0]),
arg0_shape[1],
static_cast<const float*>(inputs[1]),
1,
&beta,
static_cast<float*>(outputs[0]),
1));
CUBLAS_SAFE_CALL(
cublasSetPointerMode(*m_ctx->cublas_handle, CUBLAS_POINTER_MODE_DEVICE));
debug_sync();
}});
primitive_index = this->m_primitive_emitter->register_primitive(dot, hash);
}
else
{
size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
size_t axes_for_k_count = reduction_axes;
size_t m = 1;
size_t n = 1;
size_t k = 1;
// check that the input and output sizes are consistent
// check and calculate k for arg0 and arg1
size_t arg0_k_idx = axes_for_m_count; // first axis in arg0 for k
size_t arg1_k_idx = 0; // first axis in arg1 for k
for (size_t i = 0; i < axes_for_k_count; i++)
{
k *= arg0_shape[arg0_k_idx];
if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
{
std::vector<std::string> arg_vec{"arg0", "arg1"};
std::vector<Shape> shape_vec{arg0_shape, arg1_shape};
throw std::invalid_argument(get_error_string(arg_vec, shape_vec, node));
}
}
// check and calculate m for arg0 and out
size_t arg0_m_idx = 0; // first axis in arg0 for m
size_t out_m_idx = 0; // first axis in out for m
for (size_t i = 0; i < axes_for_m_count; i++)
{
m *= arg0_shape[arg0_m_idx];
if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
{
std::vector<std::string> arg_vec{"arg0", "output"};
std::vector<Shape> shape_vec{arg0_shape, out_shape};
throw std::invalid_argument(get_error_string(arg_vec, shape_vec, node));
}
}
// check and calculate n for arg1 and out
size_t arg1_n_idx = axes_for_k_count; // first axis in arg1 for n
size_t out_n_idx = axes_for_m_count; // first axis in out for n
for (size_t i = 0; i < axes_for_n_count; i++)
{
n *= arg1_shape[arg1_n_idx];
if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
{
std::vector<std::string> arg_vec{"arg1", "output"};
std::vector<Shape> shape_vec{arg1_shape, out_shape};
throw std::invalid_argument(get_error_string(arg_vec, shape_vec, node));
}
}
dot.reset(new gpu::primitive{[=](void** inputs, void** outputs) {
const float alpha = 1.0;
const float beta = 0;
CUBLAS_SAFE_CALL(cublasSetPointerMode(*m_ctx->cublas_handle, CUBLAS_POINTER_MODE_HOST));
CUBLAS_SAFE_CALL(cublasSgemm(*m_ctx->cublas_handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
n,
m,
k,
&alpha,
static_cast<const float*>(inputs[1]),
n,
static_cast<const float*>(inputs[0]),
k,
&beta,
static_cast<float*>(outputs[0]),
n));
CUBLAS_SAFE_CALL(
cublasSetPointerMode(*m_ctx->cublas_handle, CUBLAS_POINTER_MODE_DEVICE));
debug_sync();
}});
primitive_index = this->m_primitive_emitter->register_primitive(dot, hash);
}
return primitive_index;
}
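// Note on the general case above: it relies on the usual row-major/column-major
// identity. For row-major A (m x k), B (k x n) and C (m x n), computing the
// column-major product C^T = B^T * A^T yields exactly the row-major layout of
// C = A * B, which is why cublasSgemm is called with the operands swapped
// (inputs[1] first) and the dimensions ordered (n, m, k) while both transpose
// flags stay CUBLAS_OP_N.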
void runtime::gpu::CUBLASEmitter::sync()
{
CUDA_RT_SAFE_CALL(cudaDeviceSynchronize());
return;
}
void runtime::gpu::CUBLASEmitter::debug_sync()
{
#ifdef NGRAPH_DEBUG_ENABLE
CUDA_RT_SAFE_CALL(cudaDeviceSynchronize());
#endif
return;
}
std::string runtime::gpu::CUBLASEmitter::get_error_string(std::vector<std::string>& arg_names,
std::vector<Shape>& shapes,
const Node* node)
{
std::stringstream ss_err;
ss_err << ngraph::join(arg_names) << " with " << ngraph::join(shapes)
<< " respectively, at Node " << node->get_name() << ", do not match for dot op";
return ss_err.str();
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <cublas_v2.h>
#include "ngraph/op/dot.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class GPUPrimitiveEmitter;
class CUBLASEmitter
{
friend class GPUPrimitiveEmitter;
public:
size_t build_dot(const element::Type& dtype,
const Shape& arg0_shape,
const Shape& arg1_shape,
const Shape& out_shape,
size_t reduction_axes,
const Node* node);
void debug_sync();
void sync();
private:
CUBLASEmitter(GPUPrimitiveEmitter* emitter, GPURuntimeContext* ctx);
GPUPrimitiveEmitter* m_primitive_emitter;
GPURuntimeContext* m_ctx;
std::string get_error_string(std::vector<std::string>& arg_names,
std::vector<Shape>& shapes,
const Node* node);
};
}
}
}
(Diffs omitted here: one source diff too large to display, plus two collapsed diffs.)
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <memory>
#include <vector>
#include <cudnn.h>
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
template <typename T>
struct cudnn_descriptor;
/// \brief A factory which builds cuDNN descriptors
/// and manages their creation and destruction.
class CUDNNDescriptors
{
class Descriptor
{
public:
Descriptor() = default;
virtual ~Descriptor() = default;
};
public:
CUDNNDescriptors() = default;
~CUDNNDescriptors() = default;
template <typename T>
T& build()
{
// A function-local class for type erased storage of
// various cudnn descriptor types which is templated
// over function `build`'s specialization type
class descriptor_ : public Descriptor
{
public:
descriptor_() { cudnn_descriptor<T>::create(stored_descriptor); }
~descriptor_() { cudnn_descriptor<T>::destroy(stored_descriptor); }
T& get() { return stored_descriptor; }
private:
T stored_descriptor;
};
m_descriptors.emplace_back(new descriptor_);
return static_cast<descriptor_*>(m_descriptors.back().get())->get();
}
private:
std::vector<std::unique_ptr<Descriptor>> m_descriptors;
};
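// Illustrative usage sketch (not part of the original header; identifiers below are
// hypothetical): the factory owns every descriptor it hands out, so callers never
// pair create/destroy calls manually.
//
//   CUDNNDescriptors descriptors;
//   auto& tensor_desc = descriptors.build<cudnnTensorDescriptor_t>();
//   CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(
//       tensor_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 3, 224, 224));
//   // tensor_desc remains valid until `descriptors` is destroyed.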
template <>
struct cudnn_descriptor<cudnnLRNDescriptor_t>
{
static void create(cudnnLRNDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateLRNDescriptor(&desc));
}
static void destroy(cudnnLRNDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyLRNDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnTensorDescriptor_t>
{
static void create(cudnnTensorDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&desc));
}
static void destroy(cudnnTensorDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyTensorDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnSpatialTransformerDescriptor_t>
{
static void create(cudnnSpatialTransformerDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateSpatialTransformerDescriptor(&desc));
}
static void destroy(cudnnSpatialTransformerDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroySpatialTransformerDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnReduceTensorDescriptor_t>
{
static void create(cudnnReduceTensorDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateReduceTensorDescriptor(&desc));
}
static void destroy(cudnnReduceTensorDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyReduceTensorDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnRNNDescriptor_t>
{
static void create(cudnnRNNDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateRNNDescriptor(&desc));
}
static void destroy(cudnnRNNDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyRNNDescriptor(desc));
}
};
#if CUDNN_VERSION >= 7200
template <>
struct cudnn_descriptor<cudnnRNNDataDescriptor_t>
{
static void create(cudnnRNNDataDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateRNNDataDescriptor(&desc));
}
static void destroy(cudnnRNNDataDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyRNNDataDescriptor(desc));
}
};
#endif
template <>
struct cudnn_descriptor<cudnnPoolingDescriptor_t>
{
static void create(cudnnPoolingDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreatePoolingDescriptor(&desc));
}
static void destroy(cudnnPoolingDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyPoolingDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnOpTensorDescriptor_t>
{
static void create(cudnnOpTensorDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateOpTensorDescriptor(&desc));
}
static void destroy(cudnnOpTensorDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyOpTensorDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnFilterDescriptor_t>
{
static void create(cudnnFilterDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateFilterDescriptor(&desc));
}
static void destroy(cudnnFilterDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyFilterDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnDropoutDescriptor_t>
{
static void create(cudnnDropoutDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateDropoutDescriptor(&desc));
}
static void destroy(cudnnDropoutDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyDropoutDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnConvolutionDescriptor_t>
{
static void create(cudnnConvolutionDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateConvolutionDescriptor(&desc));
}
static void destroy(cudnnConvolutionDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyConvolutionDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnCTCLossDescriptor_t>
{
static void create(cudnnCTCLossDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateCTCLossDescriptor(&desc));
}
static void destroy(cudnnCTCLossDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyCTCLossDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnActivationDescriptor_t>
{
static void create(cudnnActivationDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateActivationDescriptor(&desc));
}
static void destroy(cudnnActivationDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyActivationDescriptor(desc));
}
};
}
}
}
(Two collapsed diffs not shown.)
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <list>
#include <memory>
#include <cudnn.h>
#include "ngraph/log.hpp"
#include "ngraph/runtime/gpu/gpu_host_parameters.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
/// \brief A factory which builds cuDNN host parameters
/// and manages their creation and destruction.
class CUDNNHostParameters
{
public:
CUDNNHostParameters(const std::shared_ptr<GPUHostParameters> params)
: m_host_parameters(params)
{
}
~CUDNNHostParameters() = default;
void* allocate_by_datatype(const cudnnDataType_t data_type, const double value)
{
void* r = nullptr;
switch (data_type)
{
case CUDNN_DATA_FLOAT:
r = m_host_parameters->cache(static_cast<float>(value));
break;
case CUDNN_DATA_DOUBLE:
r = m_host_parameters->cache(static_cast<double>(value));
break;
case CUDNN_DATA_INT8:
r = m_host_parameters->cache(static_cast<int8_t>(value));
break;
case CUDNN_DATA_INT32:
r = m_host_parameters->cache(static_cast<int32_t>(value));
break;
case CUDNN_DATA_HALF:
case CUDNN_DATA_INT8x4:
case CUDNN_DATA_UINT8:
case CUDNN_DATA_UINT8x4:
default:
throw std::runtime_error(
"Encountered unhandled cudnnDataType_t during compilation.");
}
return r;
}
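// Illustrative usage sketch (not part of the original header; variable names are
// hypothetical): cuDNN kernels take host pointers for their alpha/beta scaling
// factors, so an emitter caches the scalars here and passes the pointers through.
//
//   void* alpha = host_params.allocate_by_datatype(CUDNN_DATA_FLOAT, 1.0);
//   void* beta  = host_params.allocate_by_datatype(CUDNN_DATA_FLOAT, 0.0);
//   CUDNN_SAFE_CALL(
//       cudnnAddTensor(handle, alpha, a_desc, a_data, beta, c_desc, c_data));
//
// The GPUHostParameters cache keeps the cached scalars alive for the emitter's lifetime.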
private:
std::shared_ptr<GPUHostParameters> m_host_parameters;
};
}
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <string>
#include "cudnn_invoke.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
extern "C" void ngraph::runtime::gpu::cudnn_utils::cudnn_invoke_primitive(GPURuntimeContext* ctx,
size_t primitive_index,
void** args,
void** result)
{
(*ctx->cudnn_primitives[primitive_index])(args, result);
}
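// Note: the extern "C" linkage keeps this entry point free of C++ name mangling so
// that generated (codegen) code can call back into the runtime and invoke a cached
// cuDNN primitive purely by its index.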
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <cstddef>
namespace ngraph
{
namespace runtime
{
namespace gpu
{
struct GPURuntimeContext;
namespace cudnn_utils
{
extern "C" void cudnn_invoke_primitive(GPURuntimeContext* ctx,
size_t primitive_index,
void** args,
void** result);
}
}
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include "ngraph/graph_util.hpp"
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/runtime/backend_manager.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_internal_function.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_tensor.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/util.hpp"
using namespace ngraph;
using namespace std;
extern "C" runtime::BackendConstructor* get_backend_constructor_pointer()
{
class LocalBackendConstructor : public runtime::BackendConstructor
{
public:
std::shared_ptr<runtime::Backend> create(const std::string& config) override
{
return std::make_shared<runtime::gpu::GPU_Backend>();
}
};
static unique_ptr<runtime::BackendConstructor> s_backend_constructor(
new LocalBackendConstructor());
return s_backend_constructor.get();
}
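// Note (inferred from the extern "C" signature above): this exported symbol is the
// dynamic-loading hook; the backend manager loads the backend shared library and
// resolves get_backend_constructor_pointer to construct a GPU_Backend on demand.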
runtime::gpu::GPU_Backend::GPU_Backend()
: runtime::Backend()
{
}
runtime::gpu::GPU_Backend::BackendContext::BackendContext()
: m_runtime_context(new GPURuntimeContext)
, m_primitive_emitter(new GPUPrimitiveEmitter(m_runtime_context))
, m_cuda_manager(new CudaContextManager)
{
// Create the context with the driver API and make it current; runtime API calls will pick up this context.
// See http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#interoperability-between-runtime-and-driver-apis
bind_cuda_context_to_thread();
m_runtime_context->cublas_handle = new cublasHandle_t;
cublasStatus_t cublasStatus = cublasCreate(m_runtime_context->cublas_handle);
if (cublasStatus != CUBLAS_STATUS_SUCCESS)
{
throw runtime_error("cuBLAS create handle failed");
}
// Pass scalars by reference on the device
cublasSetPointerMode(*m_runtime_context->cublas_handle, CUBLAS_POINTER_MODE_DEVICE);
m_runtime_context->cudnn_handle = new cudnnHandle_t;
cudnnStatus_t cudnnStatus = cudnnCreate(m_runtime_context->cudnn_handle);
if (cudnnStatus != CUDNN_STATUS_SUCCESS)
{
throw runtime_error("cuDNN create handle failed");
}
// register with c-api runtime context
m_runtime_context->compiled_kernel_pool = new CudaFunctionPool;
}
void runtime::gpu::GPU_Backend::BackendContext::prepare_runtime_context()
{
// set context current each time in case thread changed
bind_cuda_context_to_thread();
// add pointers to gpu primitives into the gpu runtime context
m_runtime_context->gpu_primitives = m_primitive_emitter->get_primitives().data();
m_runtime_context->gpu_memory_primitives = m_primitive_emitter->get_memory_primitives().data();
}
void runtime::gpu::GPU_Backend::BackendContext::bind_cuda_context_to_thread()
{
m_cuda_manager->SetContextCurrent();
}
runtime::gpu::GPU_Backend::BackendContext::~BackendContext()
{
cublasDestroy(*m_runtime_context->cublas_handle);
delete m_runtime_context->cublas_handle;
cudnnDestroy(*m_runtime_context->cudnn_handle);
delete m_runtime_context->cudnn_handle;
delete m_runtime_context->compiled_kernel_pool;
}
shared_ptr<runtime::Tensor>
runtime::gpu::GPU_Backend::create_tensor(const element::Type& element_type, const Shape& shape)
{
return make_shared<runtime::gpu::GPUTensor>(element_type, shape);
}
shared_ptr<runtime::Tensor> runtime::gpu::GPU_Backend::create_tensor(
const element::Type& element_type, const Shape& shape, void* memory_pointer)
{
if (memory_pointer != nullptr && !is_device_pointer(memory_pointer))
{
throw ngraph_error("The pointer passed to create_tensor is not a device pointer.");
}
return make_shared<runtime::gpu::GPUTensor>(element_type, shape, memory_pointer);
}
shared_ptr<runtime::Executable> runtime::gpu::GPU_Backend::compile(shared_ptr<Function> func,
bool timing_enable)
{
shared_ptr<runtime::Executable> rc;
auto it = m_exec_map.find(func);
if (it != m_exec_map.end())
{
rc = it->second;
}
else
{
rc = make_shared<GPU_Executable>(func, timing_enable);
m_exec_map.insert({func, rc});
}
return rc;
}
runtime::gpu::GPU_Executable::GPU_Executable(shared_ptr<Function> func, bool enable_timing)
: m_context(new GPU_Backend::BackendContext())
{
FunctionInstance& instance = m_function_instance;
if (instance.m_compiled_function == nullptr)
{
m_context->bind_cuda_context_to_thread();
instance.m_compiled_function = runtime::gpu::GPUCompiledFunction::make(func, m_context);
instance.m_compiled_function->m_emit_timing = enable_timing;
instance.m_compiled_function->compile();
instance.m_runtime = instance.m_compiled_function->m_runtime;
instance.m_inputs.resize(func->get_parameters().size());
instance.m_outputs.resize(func->get_output_size());
}
set_parameters_and_results(*func);
}
void runtime::gpu::GPU_Executable::initialize_io(void** target,
const vector<shared_ptr<runtime::Tensor>>& source)
{
for (size_t i = 0; i < source.size(); i++)
{
shared_ptr<runtime::gpu::GPUTensor> tv =
dynamic_pointer_cast<runtime::gpu::GPUTensor>(source[i]);
if (tv)
{
target[i] = tv->m_allocated_buffer_pool;
}
else
{
throw invalid_argument("Tensors passed to GPU backend must be GPU Tensors");
}
}
}
bool runtime::gpu::GPU_Executable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs)
{
FunctionInstance& instance = m_function_instance;
if (instance.m_compiled_function == nullptr)
{
throw runtime_error("compile() must be called before call().");
}
// ensure the GPURuntimeContext primitive pointers are valid
m_context->prepare_runtime_context();
// Device tensors
initialize_io(instance.m_inputs.data(), inputs);
initialize_io(instance.m_outputs.data(), outputs);
auto ctx = m_context->m_runtime_context.get();
instance.m_runtime(instance.m_inputs.data(), instance.m_outputs.data(), ctx);
return true;
}
// void runtime::gpu::GPU_Backend::remove_compiled_function(shared_ptr<Function> func)
// {
// m_function_map.erase(func);
// }
vector<runtime::PerformanceCounter> runtime::gpu::GPU_Executable::get_performance_data() const
{
std::vector<runtime::PerformanceCounter> rc;
const FunctionInstance& instance = m_function_instance;
if (instance.m_compiled_function != nullptr)
{
instance.m_compiled_function->get_performance_data(rc);
}
return rc;
}
bool runtime::gpu::GPU_Backend::is_supported(const Node& op) const
{
set<string> unsupported_ops = {"Quantize",
"Dequantize",
"DynReplaceSlice",
"DynReshape",
"DynSlice",
"ShapeOf",
"All",
"Any",
"AllReduce",
"BatchMatMul",
"DynPad"
"SelectAndScatter",
"StopGradient",
"EmbeddingLookup",
"GenerateMask",
"DynBroadcast",
"Transpose",
"Range",
"Recv",
"Send"};
set<string> float_only = {"MaxPoolBackprop", "AvgPoolBackprop", "MaxPool", "Dot"};
if (unsupported_ops.find(op.description()) != unsupported_ops.end())
{
return false;
}
if (float_only.find(op.description()) != float_only.end())
{
if (op.get_output_element_type(0) != element::f32 &&
op.get_output_element_type(0) != element::f64)
{
return false;
}
}
if (op.description() == "BatchNormInference")
{
const ngraph::op::BatchNormInference* bn =
static_cast<const ngraph::op::BatchNormInference*>(&op);
if (bn->get_eps_value() < CUDNN_BN_MIN_EPSILON)
{
return false;
}
}
else if (op.description() == "BatchNormTraining")
{
const ngraph::op::BatchNormTraining* bn =
static_cast<const ngraph::op::BatchNormTraining*>(&op);
if (bn->get_eps_value() < CUDNN_BN_MIN_EPSILON)
{
return false;
}
}
return true;
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <map>
#include <memory>
#include "ngraph/runtime/backend.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
static size_t alignment = 64;
class GPUCompiledFunction;
class GPUPrimitiveEmitter;
struct GPURuntimeContext;
class CudaContextManager;
using EntryPoint_t = void(void** inputs, void** outputs, GPURuntimeContext* ctx);
using EntryPoint = std::function<EntryPoint_t>;
class GPU_Backend : public Backend
{
public:
GPU_Backend();
std::shared_ptr<ngraph::runtime::Tensor>
create_tensor(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer) override;
std::shared_ptr<ngraph::runtime::Tensor>
create_tensor(const ngraph::element::Type& element_type,
const Shape& shape) override;
std::shared_ptr<runtime::Executable> compile(std::shared_ptr<Function> func,
bool timing_enabled = false) override;
bool is_supported(const Node& node) const override;
class BackendContext
{
public:
BackendContext();
~BackendContext();
void prepare_runtime_context();
void bind_cuda_context_to_thread();
std::unique_ptr<GPURuntimeContext> m_runtime_context;
std::unique_ptr<GPUPrimitiveEmitter> m_primitive_emitter;
private:
std::unique_ptr<CudaContextManager> m_cuda_manager;
};
private:
std::map<std::shared_ptr<Function>, std::shared_ptr<Executable>> m_exec_map;
};
class GPU_Executable : public Executable
{
public:
GPU_Executable(std::shared_ptr<Function> func, bool enable_timing);
bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;
// void remove_compiled_function(std::shared_ptr<Function> func) override;
std::vector<PerformanceCounter> get_performance_data() const override;
private:
class FunctionInstance
{
public:
std::shared_ptr<GPUCompiledFunction> m_compiled_function;
bool m_performance_counters_enabled = false;
EntryPoint m_runtime;
std::vector<void*> m_inputs;
std::vector<void*> m_outputs;
} m_function_instance;
/// \brief Convert a vector of Tensor into a vector of void* where each void*
/// points to a Tensor's data buffer.
/// \param target Pointer to a pre-allocated array of void* with
/// size >= source.size()
/// \param source Source vector of Tensors
static void
initialize_io(void** target,
const std::vector<std::shared_ptr<runtime::Tensor>>& source);
std::shared_ptr<GPU_Backend::BackendContext> m_context;
};
}
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/gpu/gpu_call_frame.hpp"
using namespace ngraph;
runtime::gpu::GPUCallFrame::GPUCallFrame(const size_t& num_inputs, const size_t& num_outputs)
: m_inputs(num_inputs, nullptr)
, m_outputs(num_outputs, nullptr)
{
}
void runtime::gpu::GPUCallFrame::resolve_reservations(
const GPUCompiledFunction* compiled_function,
const std::unordered_map<std::string, size_t>& memory_reservations)
{
auto& mem_primitives = compiled_function->get_primitive_emitter()->get_memory_primitives();
for (auto const& p : memory_reservations)
{
// mem_primitives may return pointers for constant or workspace reservations
m_memory_reservations[p.first] = static_cast<unsigned char*>(mem_primitives.at(p.second)());
}
}
void runtime::gpu::GPUCallFrame::resolve_inputs(void** inputs, size_t num_inputs)
{
// num_inputs is > 0 iff we are resolving inputs from a nested function call
if (num_inputs == 0)
{
num_inputs = m_inputs.size();
}
for (size_t i = 0; i < num_inputs; i++)
{
void* input = inputs[i];
m_inputs[i] = static_cast<unsigned char*>(input);
}
}
void runtime::gpu::GPUCallFrame::resolve_outputs(void** outputs, size_t num_outputs)
{
// num_outputs is > 0 iff we are resolving outputs from a nested function call
if (num_outputs == 0)
{
num_outputs = m_outputs.size();
}
for (size_t i = 0; i < num_outputs; i++)
{
void* output = outputs[i];
m_outputs[i] = static_cast<unsigned char*>(output);
}
}
// returns pointers of any TensorRole
std::vector<void*>
runtime::gpu::GPUCallFrame::get_tensor_io(const std::vector<GPUTensorWrapper>& tensors)
{
std::vector<void*> ptrs;
for (auto const& tensor : tensors)
{
auto offset = tensor.get_offset();
auto ptr = get_pointer(offset.first, offset.second, tensor.get_name());
ptrs.push_back(ptr);
}
return ptrs;
}
void* runtime::gpu::GPUCallFrame::get_pointer(const TensorRole& type,
const size_t& offset,
const std::string& name)
{
switch (type)
{
case TensorRole::CONSTANT:
case TensorRole::INTERMEDIATE:
return static_cast<void*>(m_memory_reservations.at(name) + offset);
case TensorRole::INPUT: return static_cast<void*>(m_inputs.at(offset));
case TensorRole::OUTPUT: return static_cast<void*>(m_outputs.at(offset));
case TensorRole::UNKNOWN:
default: throw ngraph_error("GPUCallFrame encountered unknown or uninitialized tensor type");
};
}
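// Illustrative call sequence for the class above (not part of the original file;
// local names are hypothetical):
//
//   GPUCallFrame frame(num_inputs, num_outputs);
//   frame.resolve_reservations(compiled_function, memory_reservations);
//   frame.resolve_inputs(inputs);   // raw device pointers supplied by the caller
//   frame.resolve_outputs(outputs);
//   std::vector<void*> args = frame.get_tensor_io(tensor_wrappers);
//   // each args[i] now points into the constant pool, workspace, input or output
//   // buffer selected by that wrapper's TensorRole and offset.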
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <functional>
#include <memory>
#include <unordered_map>
#include "ngraph/runtime/gpu/gpu_compiled_function.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_wrapper.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class GPUCallFrame
{
public:
GPUCallFrame(const size_t& num_inputs, const size_t& num_outputs);
void resolve_reservations(
const GPUCompiledFunction* compiled_function,
const std::unordered_map<std::string, size_t>& memory_reservations);
void resolve_inputs(void** inputs, size_t num_inputs = 0);
void resolve_outputs(void** outputs, size_t num_outputs = 0);
std::vector<void*> get_tensor_io(const std::vector<GPUTensorWrapper>& tensors);
private:
void* get_pointer(const TensorRole& type,
const size_t& offset,
const std::string& name = "");
std::unordered_map<std::string, unsigned char*> m_memory_reservations;
std::vector<unsigned char*> m_inputs;
std::vector<unsigned char*> m_outputs;
};
}
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <algorithm>
#include <cstdlib>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include <fstream>
#include <locale>
#include <mutex>
#include <string>
#include <tuple>
#include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/layout/dense_tensor_layout.hpp"
#include "ngraph/descriptor/output.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/function.hpp"
#include "ngraph/node.hpp"
#include "ngraph/pass/algebraic_simplification.hpp"
#include "ngraph/pass/fused_op_decomposition.hpp"
#include "ngraph/pass/get_output_element_elimination.hpp"
#include "ngraph/pass/implicit_broadcast_elimination.hpp"
#include "ngraph/pass/like_replacement.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_compiled_function.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_internal_function.hpp"
#include "ngraph/runtime/gpu/op/batch_norm.hpp"
#include "ngraph/runtime/gpu/op/rnn.hpp"
#include "ngraph/runtime/gpu/pass/gpu_batch_norm_cache.hpp"
#include "ngraph/runtime/gpu/pass/gpu_layout.hpp"
#include "ngraph/runtime/gpu/pass/gpu_rnn_fusion.hpp"
#include "ngraph/runtime/gpu/pass/tensor_memory_reservation.hpp"
using namespace std;
using namespace ngraph;
std::string runtime::gpu::GPUCompiledFunction::get_output_dir()
{
static std::string output_dir = "gpu_codegen";
return output_dir;
}
size_t runtime::gpu::GPUCompiledFunction::get_memory_alignment()
{
static size_t memory_pool_alignment = 64;
return memory_pool_alignment;
}
static std::mutex s_compilation;
class GPUStaticInitializers
{
public:
GPUStaticInitializers()
{
file_util::remove_directory(runtime::gpu::GPUCompiledFunction::get_output_dir());
file_util::make_directory(runtime::gpu::GPUCompiledFunction::get_output_dir());
}
};
static GPUStaticInitializers s_static_initializers;
runtime::gpu::GPUCompiledFunction::GPUCompiledFunction(
const shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context)
: m_runtime(nullptr)
, m_function(function)
, m_emit_timing(false)
, m_is_compiled(false)
, m_shared_context(shared_context)
{
}
runtime::gpu::GPUCompiledFunction::~GPUCompiledFunction()
{
}
std::vector<std::string> get_case_variants(std::vector<std::string> cases)
{
std::vector<std::string> results;
for (auto& c : cases)
{
results.push_back(c);
if (std::all_of(c.begin(), c.end(), ::isdigit))
{
continue;
}
for (auto i = 0u; i < c.size(); i++)
{
c[i] = std::toupper(c[i], std::locale());
if (i == 0)
{
results.emplace_back(c);
}
}
results.emplace_back(c);
}
return results;
}
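// For example, get_case_variants({"0", "false"}) returns {"0", "false", "False",
// "FALSE"}; purely numeric entries are left unchanged. This lets NGRAPH_CODEGEN=False
// and NGRAPH_CODEGEN=FALSE both disable codegen in make() below.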
std::shared_ptr<runtime::gpu::GPUCompiledFunction> runtime::gpu::GPUCompiledFunction::make(
const std::shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context)
{
#if defined(NGRAPH_DEX_ONLY)
return std::make_shared<runtime::gpu::GPUInternalFunction>(function, shared_context);
#else
// For now codegen is default unless explicitly disabled
bool use_codegen = true;
if (auto env = std::getenv("NGRAPH_CODEGEN"))
{
std::string env_codegen(env);
for (auto& opt : get_case_variants({"0", "false"}))
{
if (env_codegen == opt)
{
use_codegen = false;
}
}
}
if (use_codegen)
{
return std::make_shared<runtime::gpu::GPUExternalFunction>(function, shared_context);
}
else
{
return std::make_shared<runtime::gpu::GPUInternalFunction>(function, shared_context);
}
#endif
}
void runtime::gpu::GPUCompiledFunction::compile()
{
if (m_is_compiled)
{
return;
}
std::unique_lock<std::mutex> lock(s_compilation);
m_function_name = m_function->get_name();
auto allocator = std::make_shared<runtime::gpu::GPUAllocator>(
m_shared_context->m_primitive_emitter->get_memory_allocator());
ngraph::pass::Manager pass_manager;
#if CUDNN_VERSION >= 7200
// recurrent network fusion
pass_manager.register_pass<runtime::gpu::pass::LSTMFusion>();
pass_manager.register_pass<runtime::gpu::pass::RNNFusion>();
pass_manager.register_pass<ngraph::pass::AlgebraicSimplification>();
pass_manager.register_pass<runtime::gpu::pass::MultiLayerRNNFusion>();
#else
pass_manager.register_pass<ngraph::pass::AlgebraicSimplification>();
#endif
pass_manager.register_pass<runtime::gpu::pass::BatchNormCache>();
pass_manager.register_pass<ngraph::pass::LikeReplacement>();
pass_manager.register_pass<ngraph::pass::FusedOpDecomposition>();
pass_manager.register_pass<ngraph::pass::ImplicitBroadcastElimination>();
pass_manager.register_pass<runtime::gpu::pass::GPULayout>(this);
pass_manager.register_pass<ngraph::pass::AssignLayout<descriptor::layout::DenseTensorLayout>>();
pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>();
pass_manager.register_pass<ngraph::pass::Liveness>();
pass_manager.register_pass<ngraph::pass::MemoryLayout>(get_memory_alignment());
pass_manager.register_pass<runtime::gpu::pass::TensorMemoryReservation>(
*allocator, m_tensor_memory_buffers);
string dump_filename = file_util::path_join(get_output_dir(), m_function_name + "_ops.txt");
pass_manager.register_pass<ngraph::pass::DumpSorted>(dump_filename);
pass_manager.run_passes(m_function);
m_function_ordered_ops.emplace(m_function, m_function->get_ordered_ops());
add_passes(pass_manager);
emit();
// allocate device buffers for primitive arguments and workspace
allocator->close();
m_shared_context->m_primitive_emitter->allocate_primitive_memory();
compile_function();
m_is_compiled = true;
}
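// Note on ordering: Liveness and MemoryLayout compute tensor offsets against
// get_memory_alignment(), and TensorMemoryReservation records the resulting buffer
// sizes into m_tensor_memory_buffers, so allocator->close() and
// allocate_primitive_memory() can reserve the device workspace up front, before the
// generated runtime is invoked.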
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <functional>
#include <memory>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include "ngraph/function.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_wrapper.hpp"
#define EMIT_ARGS \
runtime::gpu::GPUCompiledFunction *compiled_function, const std::string &function_name, \
const Node *node, const std::vector<runtime::gpu::GPUTensorWrapper> &args, \
const std::vector<runtime::gpu::GPUTensorWrapper> &out
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class GPU_Emitter;
struct GPURuntimeContext;
class GPUCompiledFunction
{
friend class GPU_Backend;
friend class GPU_Executable;
public:
GPUCompiledFunction(
const std::shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context);
virtual ~GPUCompiledFunction();
static std::shared_ptr<GPUCompiledFunction>
make(const std::shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context);
std::unique_ptr<runtime::gpu::GPURuntimeContext>& ctx();
const std::unique_ptr<GPUPrimitiveEmitter>& get_primitive_emitter() const
{
return m_shared_context->m_primitive_emitter;
}
virtual std::string
add_to_runtime(size_t primitive_index,
const std::string& function_name,
const std::vector<runtime::gpu::GPUTensorWrapper>& args,
const std::vector<runtime::gpu::GPUTensorWrapper>& out) = 0;
virtual std::string
add_call_to_runtime(const std::string& caller,
const std::string& callee,
const std::vector<runtime::gpu::GPUTensorWrapper>& args,
const std::vector<runtime::gpu::GPUTensorWrapper>& out) = 0;
void compile();
virtual void
get_performance_data(std::vector<runtime::PerformanceCounter>& rc) const = 0;
static size_t get_memory_alignment();
static std::string get_output_dir();
protected:
virtual void compile_function() = 0;
virtual void add_passes(ngraph::pass::Manager& pass_manager) = 0;
virtual void emit() = 0;
EntryPoint m_runtime;
// For non-destructive passthrough kernels, propagate function
// input buffers to internal ops
virtual void propagate_in_place_input(ngraph::descriptor::Output* output,
const std::string& input_name) = 0;
// For in-place kernels, propagate function output buffers to
// internal ops
virtual void propagate_in_place_output(ngraph::descriptor::Output* res_src_output,
const std::string& output_name) = 0;
std::shared_ptr<ngraph::Function> m_function;
std::unordered_map<std::shared_ptr<Function>, std::list<std::shared_ptr<Node>>>
m_function_ordered_ops;
bool m_emit_timing;
bool m_is_compiled;
size_t m_offset;
std::string m_function_name;
std::unordered_map<std::string, size_t> m_tensor_memory_buffers;
std::shared_ptr<GPU_Backend::BackendContext> m_shared_context;
};
}
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <memory>
#include <string>
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"
using namespace ngraph;
runtime::gpu::CudaContextManager::CudaContextManager()
{
CUDA_SAFE_CALL(cuInit(0));
CUDA_SAFE_CALL(cuDeviceGet(&m_device, 0));
CUDA_SAFE_CALL(cuDevicePrimaryCtxRetain(&m_context, m_device));
}
runtime::gpu::CudaContextManager::~CudaContextManager()
{
CUDA_SAFE_CALL_NO_THROW(cuDevicePrimaryCtxRelease(m_device));
}
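// Note: retaining the device's primary context lets driver-API calls
// (cuModuleLoadDataEx, cuLaunchKernel) and runtime-API users (cudart, cuBLAS, cuDNN)
// share one context; BackendContext::bind_cuda_context_to_thread() simply calls
// SetContextCurrent() before any per-thread GPU work.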
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <cuda.h>
#include <memory>
#include <string>
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class CudaContextManager
{
public:
CudaContextManager();
~CudaContextManager();
CudaContextManager(CudaContextManager const&) = delete;
CudaContextManager(CudaContextManager&&) = delete;
CudaContextManager& operator=(CudaContextManager const&) = delete;
CudaContextManager& operator=(CudaContextManager&&) = delete;
CUcontext GetContext() { return m_context; }
void SetContextCurrent() { cuCtxSetCurrent(m_context); }
protected:
CUdevice m_device;
CUcontext m_context;
};
}
}
}
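A minimal usage sketch of the context manager above (the function name example_bind_context is hypothetical; only the class's own API and CUDA driver types are used): the manager retains the primary context for device 0 on construction, can bind it to the calling thread, and releases it on destruction.

#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"

void example_bind_context()
{
    ngraph::runtime::gpu::CudaContextManager manager; // cuInit + primary-context retain
    manager.SetContextCurrent();                      // cuCtxSetCurrent on this thread
    CUcontext raw = manager.GetContext();             // raw handle, still owned by the manager
    (void)raw;                                        // use the context with driver API calls here
} // destructor releases the primary context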
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <cstring>
#include <iostream>
#include <string>
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
using namespace ngraph;
std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionBuilder::get(const std::string& name,
const std::string& kernel,
int number_of_options,
const char** options)
{
nvrtcProgram prog;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,
kernel.c_str(),
"ngraph.cu",
0, // numHeaders
NULL, // headers
NULL)); // includeNames
nvrtcResult compile_result = nvrtcCompileProgram(prog, number_of_options, options);
// output compiler log helper
auto emit_log = [&prog]() {
size_t logSize;
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
char* log = static_cast<char*>(malloc(sizeof(char) * logSize + 1));
NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
log[logSize] = '\x0';
if (std::strlen(log) >= 2)
{
std::cerr << log;
}
free(log);
};
// throw if compilation was not successful
if (compile_result != NVRTC_SUCCESS)
{
std::cerr << "Compile error: \n" + kernel;
// output compiler errors
emit_log();
throw std::runtime_error("NVRTC compilation failure.");
}
// output any compiler warnings
emit_log();
// retrieve the intermediate PTX for the compiled program
size_t ptx_size;
NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptx_size));
char* ptx = new char[ptx_size];
NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx));
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); // the NVRTC program is no longer needed
// load the generated PTX and extract a handle to the compiled kernel
CUmodule module;
CUfunction function;
CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, nullptr, nullptr));
CUDA_SAFE_CALL(cuModuleGetFunction(&function, module, name.c_str()));
delete[] ptx; // the module keeps its own copy of the PTX image
return std::make_shared<CUfunction>(function);
}
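A minimal sketch of driving this builder directly (the kernel source, the symbol name cuda_fill, and the buffer size are hypothetical; a current CUDA context is assumed to have been set, e.g. via CudaContextManager):

#include <cuda.h>
#include <string>
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"

void example_compile_and_launch()
{
    // extern "C" keeps the symbol unmangled so cuModuleGetFunction can find it by name
    const std::string kernel_source =
        "extern \"C\" __global__ void cuda_fill(float* data) { data[threadIdx.x] = 1.0f; }";
    const char* opts[] = {"--gpu-architecture=compute_35"};
    auto fn = ngraph::runtime::gpu::CudaFunctionBuilder::get("cuda_fill", kernel_source, 1, opts);

    CUdeviceptr buffer;
    cuMemAlloc(&buffer, 32 * sizeof(float));
    void* args[] = {&buffer};
    cuLaunchKernel(*fn, 1, 1, 1, 32, 1, 1, 0, nullptr, args, nullptr); // one block of 32 threads
    cuCtxSynchronize();
    cuMemFree(buffer);
}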
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <memory>
#include <string>
#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class CudaFunctionBuilder
{
public:
static std::shared_ptr<CUfunction> get(const std::string& name,
const std::string& kernel,
int number_of_options,
const char** options);
};
}
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <cctype>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
static const std::string s_output_dir = "gpu_codegen";
using namespace ngraph;
std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionPool::set(const std::string& name,
const std::string& kernel)
{
const char* opts[] = {"--gpu-architecture=compute_35", "--relocatable-device-code=true"};
std::string filename =
file_util::path_join(s_output_dir, "cuda_kernel_" + name + "_codegen.cu");
std::ofstream out(filename);
out << kernel;
out.close();
auto cu_compiled_function = CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts);
m_function_map.insert({name, cu_compiled_function});
return cu_compiled_function;
}
std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionPool::get(const std::string& name)
{
auto it = m_function_map.find(name);
if (it != m_function_map.end())
{
return (*it).second;
}
return nullptr;
}
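An illustrative wrapper (the helper name get_or_compile is hypothetical) showing the intended get-then-set caching pattern, so each kernel only goes through NVRTC once per name:

#include <memory>
#include <string>
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"

std::shared_ptr<CUfunction> get_or_compile(ngraph::runtime::gpu::CudaFunctionPool& pool,
                                           const std::string& name,
                                           const std::string& kernel_source)
{
    if (auto cached = pool.get(name)) // cache hit: reuse the previously compiled CUfunction
    {
        return cached;
    }
    return pool.set(name, kernel_source); // cache miss: compile with NVRTC, store, and return
}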
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <cuda.h>
#include <memory>
#include <string>
#include <unordered_map>
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class CudaFunctionPool
{
public:
CudaFunctionPool() {}
~CudaFunctionPool() {}
std::shared_ptr<CUfunction> set(const std::string& name, const std::string& kernel);
std::shared_ptr<CUfunction> get(const std::string& name);
private:
std::unordered_map<std::string, std::shared_ptr<CUfunction>> m_function_map;
};
}
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <string>
#include <vector>
#include "ngraph/node.hpp"
#include "ngraph/runtime/gpu/gpu_compiled_function.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_wrapper.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class GPU_Emitter
{
public:
static std::function<std::string(EMIT_ARGS)> get_emit_function(const Node& node);
// The op table include below expands to a collection of function declarations such as:
//   static std::string emit_Abs(EMIT_ARGS);
//   static std::string emit_Acos(EMIT_ARGS);
#define NGRAPH_OP(a, b) static std::string emit_##a(EMIT_ARGS);
#include "ngraph/runtime/gpu/op/op_tbl.hpp"
#undef NGRAPH_OP
template <typename T>
static std::string emit_elementwise(EMIT_ARGS)
{
if (out[0].get_size() == 0)
{
return "";
}
else if (out.size() > 1)
{
throw std::runtime_error(
"Multi-output elementwise ops are not currently supported.");
}
auto& cuda_emitter =
compiled_function->get_primitive_emitter()->get_cuda_emitter();
std::vector<std::string> dtypes;
for (auto& arg : args)
{
dtypes.push_back(arg.get_type());
}
// Special case for bool data type.
if (out[0].get_element_type() == element::boolean)
{
dtypes.push_back("bool");
}
else
{
dtypes.push_back(out[0].get_type());
}
auto ew_index = cuda_emitter->build_elementwise<T>(dtypes, out[0].get_shape());
return compiled_function->add_to_runtime(ew_index, function_name, args, out);
}
static std::string emit_ArgReduce(EMIT_ARGS, cudnnReduceTensorOp_t);
static std::string emit_Sum_0(EMIT_ARGS);
static std::string emit_Sum_1(EMIT_ARGS);
/// \brief Create a list of node names for each arg in args
/// \param args list of tensor arguments
/// \param arg_indexes a list of indexes into args for which args to include in
/// the output list, so {1, 2} will include args 1 and 2 and skip 0.
/// \return a string containing "arg0_name, arg1_name, etc."
static std::string node_names(const std::vector<GPUTensorWrapper>& args,
std::initializer_list<int> arg_indexes = {});
};
Shape get_padded_shape(const Shape& input_shape,
const Shape& padding_below,
const Shape& padding_above,
const Shape& padding_interior);
}
}
}
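A hedged illustration of the NGRAPH_OP X-macro used in GPU_Emitter above (the table entries shown are examples, not a copy of op_tbl.hpp): each entry in the op table expands to one static emitter declaration, so the set of per-op emit functions tracks the table without hand-written boilerplate.

// Hypothetical excerpt of an op table consumed by the X-macro:
//     NGRAPH_OP(Abs, ngraph::op)
//     NGRAPH_OP(Add, ngraph::op)
//
// With the definition inside GPU_Emitter,
//     #define NGRAPH_OP(a, b) static std::string emit_##a(EMIT_ARGS);
// including the table expands to:
//     static std::string emit_Abs(EMIT_ARGS);
//     static std::string emit_Add(EMIT_ARGS);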
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#if !defined(NGRAPH_DEX_ONLY)
#include <functional>
#include <memory>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include "ngraph/code_writer.hpp"
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/function.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_compiled_function.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_wrapper.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class GPU_Emitter;
struct GPURuntimeContext;
class GPUExternalFunction : public GPUCompiledFunction
{
public:
GPUExternalFunction(
const std::shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context);
virtual ~GPUExternalFunction();
virtual std::string
add_to_runtime(size_t primitive_index,
const std::string& function_name,
const std::vector<runtime::gpu::GPUTensorWrapper>& args,
const std::vector<runtime::gpu::GPUTensorWrapper>& out) override;
virtual std::string add_call_to_runtime(
const std::string& caller,
const std::string& callee,
const std::vector<runtime::gpu::GPUTensorWrapper>& args,
const std::vector<runtime::gpu::GPUTensorWrapper>& out) override;
virtual void get_performance_data(
std::vector<runtime::PerformanceCounter>& rc) const override;
protected:
virtual void compile_function() override;
virtual void add_passes(ngraph::pass::Manager& pass_manager) override;
virtual void emit() override;
private:
/// \brief Create a list of node names for each arg in args
/// \param args list of tensor arguments
/// \param arg_indexes a list of indexes into args for which args to include in
/// the output list, so {1, 2} will include args 1 and 2 and skip 0.
/// \return a string containing "arg0_name, arg1_name, etc."
std::string node_names(const std::vector<runtime::gpu::GPUTensorWrapper>& args,
std::initializer_list<int> arg_indexes = {});
void emit_header();
void emit_timer_functions();
void emit_constant_declarations();
void emit_function_declarations();
void emit_functions();
void emit_debug_function_entry(Node* node);
void emit_debug_function_exit(Node* node);
void emit_temp_mem_pool_allocation(std::shared_ptr<Function> current_function);
void store_emitted_functions(const std::string& code);
std::string emit_op(EMIT_ARGS);
std::string emit_op_as_function(const Node& node, const std::string& function_name);
std::string strip_comments(const std::string& s) const;
static const std::string& get_pch_header_source();
static const std::string& get_header_source();
// For non-destructive passthrough kernels, propagate function
// input buffers to internal ops
virtual void propagate_in_place_input(ngraph::descriptor::Output* output,
const std::string& input_name) override;
// For in-place kernels, propagate function output buffers to
// internal ops
virtual void propagate_in_place_output(ngraph::descriptor::Output* res_src_output,
const std::string& output_name) override;
CodeWriter m_writer;
std::string m_common_function_string;
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
std::map<std::string, size_t> m_name_index_map;
std::unordered_map<std::string, std::string> m_variable_name_map;
std::unordered_map<Node*, Node*> m_node_function_map;
};
}
}
}
#endif // !defined(NGRAPH_DEX_ONLY)
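A minimal call-site sketch (the function name example_compile and its parameters are hypothetical; it assumes the codegen path is enabled, i.e. NGRAPH_DEX_ONLY is not defined, and that the caller already holds the Function and the backend's shared BackendContext):

// assumes <memory> and the header declaring GPUExternalFunction above are included
void example_compile(const std::shared_ptr<ngraph::Function>& func,
                     const std::shared_ptr<ngraph::runtime::gpu::GPU_Backend::BackendContext>& ctx)
{
    auto compiled = std::make_shared<ngraph::runtime::gpu::GPUExternalFunction>(func, ctx);
    compiled->compile(); // compiles the function: pass setup, C++ emission, and JIT compilation
}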
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
namespace ngraph
{
namespace runtime
{
namespace gpu
{
void example_kernel();
}
}
}