Commit 90a1f581 authored by Robert Kimball, committed by Scott Cyphers

remove non-plaid nvidia GPU backend (#3491)

parent 11d61848
......@@ -170,10 +170,8 @@ option(NGRAPH_TOOLS_ENABLE "Control the building of tool" TRUE)
option(NGRAPH_CPU_ENABLE "Control the building of the CPU backend" TRUE)
option(NGRAPH_MLIR_ENABLE "Control the building of MLIR backend" FALSE)
option(NGRAPH_INTELGPU_ENABLE "Control the building of the Intel GPU backend with clDNN" FALSE)
option(NGRAPH_GPU_ENABLE "Control the building of the GPU backend" FALSE)
option(NGRAPH_INTERPRETER_ENABLE "Control the building of the INTERPRETER backend" TRUE)
option(NGRAPH_NOP_ENABLE "Control the building of the NOP backend" TRUE)
option(NGRAPH_GPUH_ENABLE "Control the building of the Hybrid GPU backend" FALSE)
option(NGRAPH_GENERIC_CPU_ENABLE "Enable build nGraph for generic CPU backend" FALSE)
option(NGRAPH_DEBUG_ENABLE "Enable output for NGRAPH_DEBUG statements" FALSE)
option(NGRAPH_DEPRECATED_ENABLE "Enable compiler deprecation pragmas for deprecated APIs (recommended only for development use)" FALSE)
......@@ -194,8 +192,7 @@ option(NGRAPH_DYNAMIC_COMPONENTS_ENABLE "Enable dynamic loading of components" T
if (NGRAPH_CPU_ENABLE
AND
((NOT NGRAPH_GPU_ENABLE) AND (NOT NGRAPH_GPUH_ENABLE)
AND (NOT NGRAPH_GENERIC_CPU_ENABLE) AND (NOT NGRAPH_INTELGPU_ENABLE))
((NOT NGRAPH_GENERIC_CPU_ENABLE) AND (NOT NGRAPH_INTELGPU_ENABLE))
)
set(NGRAPH_INTEL_CPU_ONLY_ENABLE ON)
endif()
......@@ -224,10 +221,6 @@ if (NGRAPH_DISTRIBUTED_ENABLE)
endif()
endif()
if (NGRAPH_GPUH_ENABLE)
set(NGRAPH_GPU_ENABLE TRUE)
endif()
if (NGRAPH_ONNX_IMPORT_ENABLE)
option(NGRAPH_USE_SYSTEM_PROTOBUF "Use system provided Protobuf shared object" FALSE)
option(NGRAPH_ONNXIFI_ENABLE "Enable ONNX Interface for Framework Integration" TRUE)
......@@ -250,10 +243,8 @@ NORMALIZE_BOOL(NGRAPH_TOOLS_ENABLE)
NORMALIZE_BOOL(NGRAPH_CPU_ENABLE)
NORMALIZE_BOOL(NGRAPH_MLIR_ENABLE)
NORMALIZE_BOOL(NGRAPH_INTELGPU_ENABLE)
NORMALIZE_BOOL(NGRAPH_GPU_ENABLE)
NORMALIZE_BOOL(NGRAPH_INTERPRETER_ENABLE)
NORMALIZE_BOOL(NGRAPH_NOP_ENABLE)
NORMALIZE_BOOL(NGRAPH_GPUH_ENABLE)
NORMALIZE_BOOL(NGRAPH_GENERIC_CPU_ENABLE)
NORMALIZE_BOOL(NGRAPH_DEBUG_ENABLE)
NORMALIZE_BOOL(NGRAPH_DEPRECATED_ENABLE)
......@@ -277,10 +268,8 @@ message(STATUS "NGRAPH_TOOLS_ENABLE: ${NGRAPH_TOOLS_ENABLE}")
message(STATUS "NGRAPH_CPU_ENABLE: ${NGRAPH_CPU_ENABLE}")
message(STATUS "NGRAPH_MLIR_ENABLE: ${NGRAPH_MLIR_ENABLE}")
message(STATUS "NGRAPH_INTELGPU_ENABLE: ${NGRAPH_INTELGPU_ENABLE}")
message(STATUS "NGRAPH_GPU_ENABLE: ${NGRAPH_GPU_ENABLE}")
message(STATUS "NGRAPH_INTERPRETER_ENABLE: ${NGRAPH_INTERPRETER_ENABLE}")
message(STATUS "NGRAPH_NOP_ENABLE: ${NGRAPH_NOP_ENABLE}")
message(STATUS "NGRAPH_GPUH_ENABLE: ${NGRAPH_GPUH_ENABLE}")
message(STATUS "NGRAPH_GENERIC_CPU_ENABLE: ${NGRAPH_GENERIC_CPU_ENABLE}")
message(STATUS "NGRAPH_DEBUG_ENABLE: ${NGRAPH_DEBUG_ENABLE}")
message(STATUS "NGRAPH_DEPRECATED_ENABLE: ${NGRAPH_DEPRECATED_ENABLE}")
......@@ -554,7 +543,7 @@ if(NGRAPH_INTELGPU_ENABLE)
include(cmake/external_cldnn.cmake)
endif()
if ((NGRAPH_GPU_ENABLE OR NGRAPH_CPU_ENABLE) AND NOT NGRAPH_DEX_ONLY)
if (NGRAPH_CPU_ENABLE AND NOT NGRAPH_DEX_ONLY)
set(NGRAPH_CODEGEN_ENABLE TRUE)
else()
set(NGRAPH_CODEGEN_ENABLE FALSE)
......
......@@ -83,7 +83,6 @@ endif
# Configuration for GPU backend in Dockerfiles with "_gpu" suffix
# The nvidia-docker command must be used for any targets that actually utilize GPU devices
ifneq ("$(shell echo ${NVIDIA_SMI} | grep nvidia-smi)","")
CMAKE_OPTIONS_EXTRA+=-DNGRAPH_GPU_ENABLE=TRUE
DOCKERFILE=${CPU_DOCKERFILE}_gpu
DOCKER_CMD=nvidia-docker
DOCKER_CMD_MESG=GPU appears to be supported on this platform. Building for GPU and CPU backend support.
......
......@@ -50,13 +50,6 @@ endif()
list(APPEND HEADER_SEARCH_DEFINES CLANG_BUILTIN_HEADERS_PATH="${CLANG_INCLUDE_DIR}")
list(APPEND HEADER_SEARCH_DEFINES NGRAPH_HEADERS_PATH="${NGRAPH_INCLUDE_PATH}")
if(NGRAPH_GPU_ENABLE)
find_package(CUDA 8 REQUIRED)
find_package(CUDNN 7 REQUIRED)
list(APPEND HEADER_SEARCH_DEFINES CUDA_HEADER_PATHS="${CUDA_INCLUDE_DIRS}")
list(APPEND HEADER_SEARCH_DEFINES CUDNN_HEADER_PATHS="${CUDNN_INCLUDE_DIRS}")
endif()
if(NGRAPH_TBB_ENABLE)
get_target_property(TBB_INCLUDE_DIR libtbb INTERFACE_INCLUDE_DIRECTORIES)
list(APPEND HEADER_SEARCH_DEFINES TBB_HEADERS_PATH="${TBB_INCLUDE_DIR}")
......
......@@ -27,18 +27,10 @@ if (NGRAPH_INTELGPU_ENABLE)
add_subdirectory(intelgpu)
endif()
if (NGRAPH_GPU_ENABLE)
add_subdirectory(gpu)
endif()
if (NGRAPH_NOP_ENABLE)
add_subdirectory(nop)
endif()
if (NGRAPH_GPUH_ENABLE)
add_subdirectory(gpuh)
endif()
if (NGRAPH_GENERIC_CPU_ENABLE)
add_subdirectory(generic_cpu)
endif()
......
# ******************************************************************************
# Copyright 2017-2019 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIRS})
# Add sources for the GPU backend
# and all its dependencies
set(SRC
cuda_emitter.cpp
cudnn_emitter.cpp
cublas_emitter.cpp
host_emitter.cpp
gpu_backend.cpp
gpu_call_frame.cpp
gpu_cuda_context_manager.cpp
gpu_cuda_function_builder.cpp
gpu_cuda_function_pool.cpp
gpu_cuda_kernel_builder.cpp
gpu_emitter.cpp
gpu_compiled_function.cpp
gpu_internal_function.cpp
gpu_invoke.cpp
gpu_kernel_args.cpp
gpu_kernel_emitters.cpp
gpu_memory_manager.cpp
gpu_primitive_emitter.cpp
gpu_runtime_constructor.cpp
gpu_runtime_context.cpp
gpu_tensor_wrapper.cpp
gpu_tensor.cpp
gpu_util.cpp
type_info.cpp
pass/gpu_batch_norm_cache.cpp
pass/gpu_layout.cpp
pass/gpu_rnn_fusion.cpp
pass/tensor_memory_reservation.cpp
op/batch_norm.cpp
op/rnn.cpp
)
if (NOT NGRAPH_DEX_ONLY)
list(APPEND SRC gpu_external_function.cpp)
endif()
set(CUDA_INC
${PROJECT_SOURCE_DIR}/src/
)
set(CUDA_SRC
nvcc/example.cu.cpp
)
if (NGRAPH_GPU_ENABLE)
find_package(CUDA 9 QUIET)
if (CUDA_FOUND)
set(CUDA9_FOUND TRUE)
message(STATUS "Found CUDA 9")
else()
find_package(CUDA 8 REQUIRED)
endif()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};
--compiler-options -fPIC;
-arch=sm_30;
-gencode=arch=compute_35,code=sm_35;
-gencode=arch=compute_50,code=sm_50;
-gencode=arch=compute_52,code=sm_52;
-gencode=arch=compute_60,code=sm_60;
-gencode=arch=compute_61,code=sm_61;
-gencode=arch=compute_61,code=compute_61)
if (CUDA9_FOUND)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};
-gencode=arch=compute_62,code=sm_62;
-gencode=arch=compute_70,code=sm_70;
-gencode=arch=compute_70,code=compute_70)
endif()
set (DO_CUDA_COMPILE FALSE)
if (CUDA9_FOUND)
if (("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU"))
# CUDA 9 supports up to gcc 6.x
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0)
set (DO_CUDA_COMPILE TRUE)
else()
message(STATUS "NVCC will not be used because CUDA 9 only supports up to gcc 6.x")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
# CUDA 9 supports up to clang 3.9
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0)
set (DO_CUDA_COMPILE TRUE)
else()
message(STATUS "NVCC will not be used because CUDA 9 only supports up to clang 3.9")
endif()
endif()
else()
# CUDA 8 (minimum version of CUDA we support)
if (("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU"))
# Current release of CUDA 8 supports up to gcc 5.4
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.5)
set (DO_CUDA_COMPILE TRUE)
else()
message(STATUS "NVCC will not be used because CUDA 8 only supports up to gcc 5.4")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
# CUDA 8 supports up to clang 3.8
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.9)
set (DO_CUDA_COMPILE TRUE)
else()
message(STATUS "NVCC will not be used because CUDA 8 only supports up to clang 3.8")
endif()
endif()
endif()
if (DO_CUDA_COMPILE)
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
# CUDA_PROPAGATE_HOST_FLAGS is true by default, so disable
# clang warnings that are known to flag CUDA code
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};
--compiler-options -Wno-reserved-id-macro;
--compiler-options -Wno-undef;
--compiler-options -Wno-old-style-cast;
--compiler-options -Wno-deprecated;
--compiler-options -Wno-unused-macros;
--compiler-options -Wno-used-but-marked-unused)
endif()
message(STATUS "Precompiling static CUDA kernels via NVCC")
set_source_files_properties( ${CUDA_SRC} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
cuda_include_directories(${CUDA_INC})
cuda_compile(CUDA_OBJ ${CUDA_SRC} STATIC)
else()
message(STATUS "Not precompiling static CUDA kernels via NVCC; runtime compilation via NVRTC will be used.")
endif()
add_library(gpu_backend SHARED ${SRC} ${CUDA_OBJ})
target_compile_definitions(gpu_backend PRIVATE GPU_BACKEND_EXPORTS)
if(NGRAPH_LIB_VERSIONING_ENABLE)
set_target_properties(gpu_backend PROPERTIES
VERSION ${NGRAPH_VERSION}
SOVERSION ${NGRAPH_API_VERSION})
endif()
target_link_libraries(gpu_backend PUBLIC ngraph)
if (NGRAPH_DEX_ONLY)
target_compile_definitions(gpu_backend PRIVATE "NGRAPH_DEX_ONLY")
else()
target_link_libraries(gpu_backend PUBLIC codegen)
endif()
find_library(CUDA_nvrtc_LIBRARY nvrtc
PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64)
find_library(CUDA_cuda_LIBRARY cuda
PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64 cuda/lib64/stubs)
find_library(CUDA_cudart_LIBRARY ${CMAKE_STATIC_LIBRARY_PREFIX}cudart_static${CMAKE_STATIC_LIBRARY_SUFFIX}
PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64)
find_package(CUDNN 7 REQUIRED)
target_include_directories(gpu_backend SYSTEM PUBLIC ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIR})
target_link_libraries(gpu_backend
PUBLIC
${CUDA_cuda_LIBRARY}
${CUDA_nvrtc_LIBRARY}
${CUDA_cudart_LIBRARY}
${CUDA_LIBRARIES}
${CUDA_CUBLAS_LIBRARIES}
${CUDNN_LIBRARIES})
install(TARGETS gpu_backend
ARCHIVE DESTINATION ${NGRAPH_INSTALL_LIB}
LIBRARY DESTINATION ${NGRAPH_INSTALL_LIB})
endif()
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/gpu/cublas_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/util.hpp"
using namespace ngraph;
runtime::gpu::CUBLASEmitter::CUBLASEmitter(GPUPrimitiveEmitter* emitter, GPURuntimeContext* ctx)
: m_primitive_emitter(emitter)
{
m_ctx = ctx;
}
size_t runtime::gpu::CUBLASEmitter::build_dot(const element::Type& dtype,
const Shape& arg0_shape,
const Shape& arg1_shape,
const Shape& out_shape,
size_t reduction_axes,
const Node* node)
{
std::stringstream ss;
ss << "dot_op"
<< "_dtype_" << dtype.c_type_string() << "_reduction_axes_count_" << reduction_axes;
std::string hash = ss.str() + "_i_" + join(arg0_shape, "_") + "_i_" + join(arg1_shape, "_");
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
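// Note: primitives are memoized here; the shape/type signature is hashed, looked up
// in the primitive emitter, and only compiled into a new gpu::primitive on a miss,
// so repeated Dot nodes with identical signatures share a single cuBLAS closure.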
std::unique_ptr<gpu::primitive> dot;
if (arg0_shape.empty() || arg1_shape.empty())
{
auto& second = (arg0_shape.empty() ? arg1_shape : arg0_shape);
size_t count = shape_size(second);
size_t firstIndex = (arg0_shape.empty() ? 0 : 1);
size_t secondIndex = (arg0_shape.empty() ? 1 : 0);
dot.reset(new gpu::primitive{[=](void** inputs, void** outputs) {
CUBLAS_SAFE_CALL(cublasScopy(*m_ctx->cublas_handle,
count,
static_cast<const float*>(inputs[secondIndex]),
1,
static_cast<float*>(outputs[0]),
1));
CUBLAS_SAFE_CALL(cublasSscal(*m_ctx->cublas_handle,
count,
static_cast<const float*>(inputs[firstIndex]),
static_cast<float*>(outputs[0]),
1));
debug_sync();
}});
primitive_index = this->m_primitive_emitter->register_primitive(dot, hash);
}
// case that can be treated as dot1d
else if ((arg0_shape.size() == arg1_shape.size()) && (arg0_shape.size() == reduction_axes))
{
for (int i = 0; i < arg0_shape.size(); i++)
{
if (arg0_shape[i] != arg1_shape[i])
{
std::vector<std::string> arg_vec{"arg0", "arg1"};
std::vector<Shape> shape_vec{arg0_shape, arg1_shape};
throw std::invalid_argument(get_error_string(arg_vec, shape_vec, node));
}
}
size_t count = shape_size(arg0_shape);
dot.reset(new gpu::primitive{[=](void** inputs, void** outputs) {
CUBLAS_SAFE_CALL(cublasSdot(*m_ctx->cublas_handle,
count,
static_cast<const float*>(inputs[0]),
1,
static_cast<const float*>(inputs[1]),
1,
static_cast<float*>(outputs[0])));
debug_sync();
}});
primitive_index = this->m_primitive_emitter->register_primitive(dot, hash);
}
// matrix vector
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1))
{
dot.reset(new gpu::primitive{[=](void** inputs, void** outputs) {
const float alpha = 1.0;
const float beta = 0;
CUBLAS_SAFE_CALL(cublasSetPointerMode(*m_ctx->cublas_handle, CUBLAS_POINTER_MODE_HOST));
CUBLAS_SAFE_CALL(cublasSgemv(*m_ctx->cublas_handle,
CUBLAS_OP_T,
arg0_shape[1],
arg0_shape[0],
&alpha,
static_cast<const float*>(inputs[0]),
arg0_shape[1],
static_cast<const float*>(inputs[1]),
1,
&beta,
static_cast<float*>(outputs[0]),
1));
CUBLAS_SAFE_CALL(
cublasSetPointerMode(*m_ctx->cublas_handle, CUBLAS_POINTER_MODE_DEVICE));
debug_sync();
}});
primitive_index = this->m_primitive_emitter->register_primitive(dot, hash);
}
else
{
size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
size_t axes_for_k_count = reduction_axes;
size_t m = 1;
size_t n = 1;
size_t k = 1;
// check that the input and output sizes are consistent
// check and calculate k for arg0 and arg1
size_t arg0_k_idx = axes_for_m_count; // first axis in arg0 for k
size_t arg1_k_idx = 0; // first axis in arg1 for k
for (size_t i = 0; i < axes_for_k_count; i++)
{
k *= arg0_shape[arg0_k_idx];
if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
{
std::vector<std::string> arg_vec{"arg0", "arg1"};
std::vector<Shape> shape_vec{arg0_shape, arg1_shape};
throw std::invalid_argument(get_error_string(arg_vec, shape_vec, node));
}
}
// check and calculate m for arg0 and out
size_t arg0_m_idx = 0; // first axis in arg0 for m
size_t out_m_idx = 0; // first axis in out for m
for (size_t i = 0; i < axes_for_m_count; i++)
{
m *= arg0_shape[arg0_m_idx];
if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
{
std::vector<std::string> arg_vec{"arg0", "output"};
std::vector<Shape> shape_vec{arg0_shape, out_shape};
throw std::invalid_argument(get_error_string(arg_vec, shape_vec, node));
}
}
// check and calculate n for arg1 and out
size_t arg1_n_idx = axes_for_k_count; // first axis in arg1 for n
size_t out_n_idx = axes_for_m_count; // first axis in out for n
for (size_t i = 0; i < axes_for_n_count; i++)
{
n *= arg1_shape[arg1_n_idx];
if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
{
std::vector<std::string> arg_vec{"arg1", "output"};
std::vector<Shape> shape_vec{arg1_shape, out_shape};
throw std::invalid_argument(get_error_string(arg_vec, shape_vec, node));
}
}
dot.reset(new gpu::primitive{[=](void** inputs, void** outputs) {
const float alpha = 1.0;
const float beta = 0;
CUBLAS_SAFE_CALL(cublasSetPointerMode(*m_ctx->cublas_handle, CUBLAS_POINTER_MODE_HOST));
CUBLAS_SAFE_CALL(cublasSgemm(*m_ctx->cublas_handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
n,
m,
k,
&alpha,
static_cast<const float*>(inputs[1]),
n,
static_cast<const float*>(inputs[0]),
k,
&beta,
static_cast<float*>(outputs[0]),
n));
CUBLAS_SAFE_CALL(
cublasSetPointerMode(*m_ctx->cublas_handle, CUBLAS_POINTER_MODE_DEVICE));
debug_sync();
}});
primitive_index = this->m_primitive_emitter->register_primitive(dot, hash);
}
return primitive_index;
}
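// Note on the general case above: it relies on the usual row-major/column-major
// identity. For row-major A (m x k), B (k x n) and C (m x n), computing the
// column-major product C^T = B^T * A^T yields exactly the row-major layout of
// C = A * B, which is why cublasSgemm is called with the operands swapped
// (inputs[1] first) and the dimensions ordered (n, m, k) while both transpose
// flags stay CUBLAS_OP_N.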
void runtime::gpu::CUBLASEmitter::sync()
{
CUDA_RT_SAFE_CALL(cudaDeviceSynchronize());
return;
}
void runtime::gpu::CUBLASEmitter::debug_sync()
{
#ifdef NGRAPH_DEBUG_ENABLE
CUDA_RT_SAFE_CALL(cudaDeviceSynchronize());
#endif
return;
}
std::string runtime::gpu::CUBLASEmitter::get_error_string(std::vector<std::string>& arg_names,
std::vector<Shape>& shapes,
const Node* node)
{
std::stringstream ss_err;
ss_err << ngraph::join(arg_names) << " with " << ngraph::join(shapes)
<< " respectively, at Node " << node->get_name() << ", do not match for dot op";
return ss_err.str();
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <cublas_v2.h>
#include "ngraph/op/dot.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class GPUPrimitiveEmitter;
class CUBLASEmitter
{
friend class GPUPrimitiveEmitter;
public:
size_t build_dot(const element::Type& dtype,
const Shape& arg0_shape,
const Shape& arg1_shape,
const Shape& out_shape,
size_t reduction_axes,
const Node* node);
void debug_sync();
void sync();
private:
CUBLASEmitter(GPUPrimitiveEmitter* emitter, GPURuntimeContext* ctx);
GPUPrimitiveEmitter* m_primitive_emitter;
GPURuntimeContext* m_ctx;
std::string get_error_string(std::vector<std::string>& arg_names,
std::vector<Shape>& shapes,
const Node* node);
};
}
}
}
(Diffs omitted here: one source diff too large to display, plus two collapsed diffs.)
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <memory>
#include <vector>
#include <cudnn.h>
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
template <typename T>
struct cudnn_descriptor;
/// \brief A factory which builds cuDNN descriptors
/// and manages their creation and destruction.
class CUDNNDescriptors
{
class Descriptor
{
public:
Descriptor() = default;
virtual ~Descriptor() = default;
};
public:
CUDNNDescriptors() = default;
~CUDNNDescriptors() = default;
template <typename T>
T& build()
{
// A function-local class for type erased storage of
// various cudnn descriptor types which is templated
// over function `build`'s specialization type
class descriptor_ : public Descriptor
{
public:
descriptor_() { cudnn_descriptor<T>::create(stored_descriptor); }
~descriptor_() { cudnn_descriptor<T>::destroy(stored_descriptor); }
T& get() { return stored_descriptor; }
private:
T stored_descriptor;
};
m_descriptors.emplace_back(new descriptor_);
return static_cast<descriptor_*>(m_descriptors.back().get())->get();
}
private:
std::vector<std::unique_ptr<Descriptor>> m_descriptors;
};
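// Illustrative usage sketch (not part of the original header; identifiers below are
// hypothetical): the factory owns every descriptor it hands out, so callers never
// pair create/destroy calls manually.
//
//   CUDNNDescriptors descriptors;
//   auto& tensor_desc = descriptors.build<cudnnTensorDescriptor_t>();
//   CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(
//       tensor_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 3, 224, 224));
//   // tensor_desc remains valid until `descriptors` is destroyed.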
template <>
struct cudnn_descriptor<cudnnLRNDescriptor_t>
{
static void create(cudnnLRNDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateLRNDescriptor(&desc));
}
static void destroy(cudnnLRNDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyLRNDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnTensorDescriptor_t>
{
static void create(cudnnTensorDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&desc));
}
static void destroy(cudnnTensorDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyTensorDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnSpatialTransformerDescriptor_t>
{
static void create(cudnnSpatialTransformerDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateSpatialTransformerDescriptor(&desc));
}
static void destroy(cudnnSpatialTransformerDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroySpatialTransformerDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnReduceTensorDescriptor_t>
{
static void create(cudnnReduceTensorDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateReduceTensorDescriptor(&desc));
}
static void destroy(cudnnReduceTensorDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyReduceTensorDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnRNNDescriptor_t>
{
static void create(cudnnRNNDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateRNNDescriptor(&desc));
}
static void destroy(cudnnRNNDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyRNNDescriptor(desc));
}
};
#if CUDNN_VERSION >= 7200
template <>
struct cudnn_descriptor<cudnnRNNDataDescriptor_t>
{
static void create(cudnnRNNDataDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateRNNDataDescriptor(&desc));
}
static void destroy(cudnnRNNDataDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyRNNDataDescriptor(desc));
}
};
#endif
template <>
struct cudnn_descriptor<cudnnPoolingDescriptor_t>
{
static void create(cudnnPoolingDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreatePoolingDescriptor(&desc));
}
static void destroy(cudnnPoolingDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyPoolingDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnOpTensorDescriptor_t>
{
static void create(cudnnOpTensorDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateOpTensorDescriptor(&desc));
}
static void destroy(cudnnOpTensorDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyOpTensorDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnFilterDescriptor_t>
{
static void create(cudnnFilterDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateFilterDescriptor(&desc));
}
static void destroy(cudnnFilterDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyFilterDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnDropoutDescriptor_t>
{
static void create(cudnnDropoutDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateDropoutDescriptor(&desc));
}
static void destroy(cudnnDropoutDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyDropoutDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnConvolutionDescriptor_t>
{
static void create(cudnnConvolutionDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateConvolutionDescriptor(&desc));
}
static void destroy(cudnnConvolutionDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyConvolutionDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnCTCLossDescriptor_t>
{
static void create(cudnnCTCLossDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateCTCLossDescriptor(&desc));
}
static void destroy(cudnnCTCLossDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyCTCLossDescriptor(desc));
}
};
template <>
struct cudnn_descriptor<cudnnActivationDescriptor_t>
{
static void create(cudnnActivationDescriptor_t& desc)
{
CUDNN_SAFE_CALL(cudnnCreateActivationDescriptor(&desc));
}
static void destroy(cudnnActivationDescriptor_t& desc)
{
CUDNN_SAFE_CALL_NO_THROW(cudnnDestroyActivationDescriptor(desc));
}
};
}
}
}
(Two collapsed diffs not shown.)
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <list>
#include <memory>
#include <cudnn.h>
#include "ngraph/log.hpp"
#include "ngraph/runtime/gpu/gpu_host_parameters.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
/// \brief A factory which builds cuDNN host parameters
/// and manages their creation and destruction.
class CUDNNHostParameters
{
public:
CUDNNHostParameters(const std::shared_ptr<GPUHostParameters> params)
: m_host_parameters(params)
{
}
~CUDNNHostParameters() = default;
void* allocate_by_datatype(const cudnnDataType_t data_type, const double value)
{
void* r = nullptr;
switch (data_type)
{
case CUDNN_DATA_FLOAT:
r = m_host_parameters->cache(static_cast<float>(value));
break;
case CUDNN_DATA_DOUBLE:
r = m_host_parameters->cache(static_cast<double>(value));
break;
case CUDNN_DATA_INT8:
r = m_host_parameters->cache(static_cast<int8_t>(value));
break;
case CUDNN_DATA_INT32:
r = m_host_parameters->cache(static_cast<int32_t>(value));
break;
case CUDNN_DATA_HALF:
case CUDNN_DATA_INT8x4:
case CUDNN_DATA_UINT8:
case CUDNN_DATA_UINT8x4:
default:
throw std::runtime_error(
"Encountered unhandled cudnnDataType_t during compilation.");
}
return r;
}
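// Illustrative usage sketch (not part of the original header; variable names are
// hypothetical): cuDNN kernels take host pointers for their alpha/beta scaling
// factors, so an emitter caches the scalars here and passes the pointers through.
//
//   void* alpha = host_params.allocate_by_datatype(CUDNN_DATA_FLOAT, 1.0);
//   void* beta  = host_params.allocate_by_datatype(CUDNN_DATA_FLOAT, 0.0);
//   CUDNN_SAFE_CALL(
//       cudnnAddTensor(handle, alpha, a_desc, a_data, beta, c_desc, c_data));
//
// The GPUHostParameters cache keeps the cached scalars alive for the emitter's lifetime.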
private:
std::shared_ptr<GPUHostParameters> m_host_parameters;
};
}
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <string>
#include "cudnn_invoke.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
extern "C" void ngraph::runtime::gpu::cudnn_utils::cudnn_invoke_primitive(GPURuntimeContext* ctx,
size_t primitive_index,
void** args,
void** result)
{
(*ctx->cudnn_primitives[primitive_index])(args, result);
}
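// Note: the extern "C" linkage keeps this entry point free of C++ name mangling so
// that generated (codegen) code can call back into the runtime and invoke a cached
// cuDNN primitive purely by its index.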
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <cstddef>
namespace ngraph
{
namespace runtime
{
namespace gpu
{
struct GPURuntimeContext;
namespace cudnn_utils
{
extern "C" void cudnn_invoke_primitive(GPURuntimeContext* ctx,
size_t primitive_index,
void** args,
void** result);
}
}
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include "ngraph/graph_util.hpp"
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/runtime/backend_manager.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_internal_function.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_tensor.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/util.hpp"
using namespace ngraph;
using namespace std;
extern "C" runtime::BackendConstructor* get_backend_constructor_pointer()
{
class LocalBackendConstructor : public runtime::BackendConstructor
{
public:
std::shared_ptr<runtime::Backend> create(const std::string& config) override
{
return std::make_shared<runtime::gpu::GPU_Backend>();
}
};
static unique_ptr<runtime::BackendConstructor> s_backend_constructor(
new LocalBackendConstructor());
return s_backend_constructor.get();
}
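// Note (inferred from the extern "C" signature above): this exported symbol is the
// dynamic-loading hook; the backend manager loads the backend shared library and
// resolves get_backend_constructor_pointer to construct a GPU_Backend on demand.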
runtime::gpu::GPU_Backend::GPU_Backend()
: runtime::Backend()
{
}
runtime::gpu::GPU_Backend::BackendContext::BackendContext()
: m_runtime_context(new GPURuntimeContext)
, m_primitive_emitter(new GPUPrimitiveEmitter(m_runtime_context))
, m_cuda_manager(new CudaContextManager)
{
// Create the context with the driver API and make it current; runtime API calls will pick up this context.
// See http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#interoperability-between-runtime-and-driver-apis
bind_cuda_context_to_thread();
m_runtime_context->cublas_handle = new cublasHandle_t;
cublasStatus_t cublasStatus = cublasCreate(m_runtime_context->cublas_handle);
if (cublasStatus != CUBLAS_STATUS_SUCCESS)
{
throw runtime_error("cuBLAS create handle failed");
}
// Pass scalars by reference on the device
cublasSetPointerMode(*m_runtime_context->cublas_handle, CUBLAS_POINTER_MODE_DEVICE);
m_runtime_context->cudnn_handle = new cudnnHandle_t;
cudnnStatus_t cudnnStatus = cudnnCreate(m_runtime_context->cudnn_handle);
if (cudnnStatus != CUDNN_STATUS_SUCCESS)
{
throw runtime_error("cuDNN create handle failed");
}
// register with c-api runtime context
m_runtime_context->compiled_kernel_pool = new CudaFunctionPool;
}
void runtime::gpu::GPU_Backend::BackendContext::prepare_runtime_context()
{
// set context current each time in case thread changed
bind_cuda_context_to_thread();
// add pointers to gpu primitives into the gpu runtime context
m_runtime_context->gpu_primitives = m_primitive_emitter->get_primitives().data();
m_runtime_context->gpu_memory_primitives = m_primitive_emitter->get_memory_primitives().data();
}
void runtime::gpu::GPU_Backend::BackendContext::bind_cuda_context_to_thread()
{
m_cuda_manager->SetContextCurrent();
}
runtime::gpu::GPU_Backend::BackendContext::~BackendContext()
{
cublasDestroy(*m_runtime_context->cublas_handle);
delete m_runtime_context->cublas_handle;
cudnnDestroy(*m_runtime_context->cudnn_handle);
delete m_runtime_context->cudnn_handle;
delete m_runtime_context->compiled_kernel_pool;
}
shared_ptr<runtime::Tensor>
runtime::gpu::GPU_Backend::create_tensor(const element::Type& element_type, const Shape& shape)
{
return make_shared<runtime::gpu::GPUTensor>(element_type, shape);
}
shared_ptr<runtime::Tensor> runtime::gpu::GPU_Backend::create_tensor(
const element::Type& element_type, const Shape& shape, void* memory_pointer)
{
if (memory_pointer != nullptr && !is_device_pointer(memory_pointer))
{
throw ngraph_error("The pointer passed to create_tensor is not a device pointer.");
}
return make_shared<runtime::gpu::GPUTensor>(element_type, shape, memory_pointer);
}
shared_ptr<runtime::Executable> runtime::gpu::GPU_Backend::compile(shared_ptr<Function> func,
bool timing_enable)
{
shared_ptr<runtime::Executable> rc;
auto it = m_exec_map.find(func);
if (it != m_exec_map.end())
{
rc = it->second;
}
else
{
rc = make_shared<GPU_Executable>(func, timing_enable);
m_exec_map.insert({func, rc});
}
return rc;
}
runtime::gpu::GPU_Executable::GPU_Executable(shared_ptr<Function> func, bool enable_timing)
: m_context(new GPU_Backend::BackendContext())
{
FunctionInstance& instance = m_function_instance;
if (instance.m_compiled_function == nullptr)
{
m_context->bind_cuda_context_to_thread();
instance.m_compiled_function = runtime::gpu::GPUCompiledFunction::make(func, m_context);
instance.m_compiled_function->m_emit_timing = enable_timing;
instance.m_compiled_function->compile();
instance.m_runtime = instance.m_compiled_function->m_runtime;
instance.m_inputs.resize(func->get_parameters().size());
instance.m_outputs.resize(func->get_output_size());
}
set_parameters_and_results(*func);
}
void runtime::gpu::GPU_Executable::initialize_io(void** target,
const vector<shared_ptr<runtime::Tensor>>& source)
{
for (size_t i = 0; i < source.size(); i++)
{
shared_ptr<runtime::gpu::GPUTensor> tv =
dynamic_pointer_cast<runtime::gpu::GPUTensor>(source[i]);
if (tv)
{
target[i] = tv->m_allocated_buffer_pool;
}
else
{
throw invalid_argument("Tensors passed to GPU backend must be GPU Tensors");
}
}
}
bool runtime::gpu::GPU_Executable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs)
{
FunctionInstance& instance = m_function_instance;
if (instance.m_compiled_function == nullptr)
{
throw runtime_error("compile() must be called before call().");
}
// ensure the GPURuntimeContext primitive pointers are valid
m_context->prepare_runtime_context();
// Device tensors
initialize_io(instance.m_inputs.data(), inputs);
initialize_io(instance.m_outputs.data(), outputs);
auto ctx = m_context->m_runtime_context.get();
instance.m_runtime(instance.m_inputs.data(), instance.m_outputs.data(), ctx);
return true;
}
// void runtime::gpu::GPU_Backend::remove_compiled_function(shared_ptr<Function> func)
// {
// m_function_map.erase(func);
// }
vector<runtime::PerformanceCounter> runtime::gpu::GPU_Executable::get_performance_data() const
{
std::vector<runtime::PerformanceCounter> rc;
const FunctionInstance& instance = m_function_instance;
if (instance.m_compiled_function != nullptr)
{
instance.m_compiled_function->get_performance_data(rc);
}
return rc;
}
bool runtime::gpu::GPU_Backend::is_supported(const Node& op) const
{
set<string> unsupported_ops = {"Quantize",
"Dequantize",
"DynReplaceSlice",
"DynReshape",
"DynSlice",
"ShapeOf",
"All",
"Any",
"AllReduce",
"BatchMatMul",
"DynPad"
"SelectAndScatter",
"StopGradient",
"EmbeddingLookup",
"GenerateMask",
"DynBroadcast",
"Transpose",
"Range",
"Recv",
"Send"};
set<string> float_only = {"MaxPoolBackprop", "AvgPoolBackprop", "MaxPool", "Dot"};
if (unsupported_ops.find(op.description()) != unsupported_ops.end())
{
return false;
}
if (float_only.find(op.description()) != float_only.end())
{
if (op.get_output_element_type(0) != element::f32 &&
op.get_output_element_type(0) != element::f64)
{
return false;
}
}
if (op.description() == "BatchNormInference")
{
const ngraph::op::BatchNormInference* bn =
static_cast<const ngraph::op::BatchNormInference*>(&op);
if (bn->get_eps_value() < CUDNN_BN_MIN_EPSILON)
{
return false;
}
}
else if (op.description() == "BatchNormTraining")
{
const ngraph::op::BatchNormTraining* bn =
static_cast<const ngraph::op::BatchNormTraining*>(&op);
if (bn->get_eps_value() < CUDNN_BN_MIN_EPSILON)
{
return false;
}
}
return true;
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <map>
#include <memory>
#include "ngraph/runtime/backend.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
static size_t alignment = 64;
class GPUCompiledFunction;
class GPUPrimitiveEmitter;
struct GPURuntimeContext;
class CudaContextManager;
using EntryPoint_t = void(void** inputs, void** outputs, GPURuntimeContext* ctx);
using EntryPoint = std::function<EntryPoint_t>;
class GPU_Backend : public Backend
{
public:
GPU_Backend();
std::shared_ptr<ngraph::runtime::Tensor>
create_tensor(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer) override;
std::shared_ptr<ngraph::runtime::Tensor>
create_tensor(const ngraph::element::Type& element_type,
const Shape& shape) override;
std::shared_ptr<runtime::Executable> compile(std::shared_ptr<Function> func,
bool timing_enabled = false) override;
bool is_supported(const Node& node) const override;
class BackendContext
{
public:
BackendContext();
~BackendContext();
void prepare_runtime_context();
void bind_cuda_context_to_thread();
std::unique_ptr<GPURuntimeContext> m_runtime_context;
std::unique_ptr<GPUPrimitiveEmitter> m_primitive_emitter;
private:
std::unique_ptr<CudaContextManager> m_cuda_manager;
};
private:
std::map<std::shared_ptr<Function>, std::shared_ptr<Executable>> m_exec_map;
};
class GPU_Executable : public Executable
{
public:
GPU_Executable(std::shared_ptr<Function> func, bool enable_timing);
bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;
// void remove_compiled_function(std::shared_ptr<Function> func) override;
std::vector<PerformanceCounter> get_performance_data() const override;
private:
class FunctionInstance
{
public:
std::shared_ptr<GPUCompiledFunction> m_compiled_function;
bool m_performance_counters_enabled = false;
EntryPoint m_runtime;
std::vector<void*> m_inputs;
std::vector<void*> m_outputs;
} m_function_instance;
/// \brief Convert a vector of Tensor into a vector of void* where each void*
/// points to a Tensor's data buffer.
/// \param target Pointer to a pre-allocated array of void* with
/// size >= source.size()
/// \param source Source vector of Tensors
static void
initialize_io(void** target,
const std::vector<std::shared_ptr<runtime::Tensor>>& source);
std::shared_ptr<GPU_Backend::BackendContext> m_context;
};
}
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/gpu/gpu_call_frame.hpp"
using namespace ngraph;
runtime::gpu::GPUCallFrame::GPUCallFrame(const size_t& num_inputs, const size_t& num_outputs)
: m_inputs(num_inputs, nullptr)
, m_outputs(num_outputs, nullptr)
{
}
void runtime::gpu::GPUCallFrame::resolve_reservations(
const GPUCompiledFunction* compiled_function,
const std::unordered_map<std::string, size_t>& memory_reservations)
{
auto& mem_primitives = compiled_function->get_primitive_emitter()->get_memory_primitives();
for (auto const& p : memory_reservations)
{
// mem_primitives may return pointers for constant or workspace reservations
m_memory_reservations[p.first] = static_cast<unsigned char*>(mem_primitives.at(p.second)());
}
}
void runtime::gpu::GPUCallFrame::resolve_inputs(void** inputs, size_t num_inputs)
{
// num_inputs is > 0 iff we are resolving inputs from a nested function call
if (num_inputs == 0)
{
num_inputs = m_inputs.size();
}
for (size_t i = 0; i < num_inputs; i++)
{
void* input = inputs[i];
m_inputs[i] = static_cast<unsigned char*>(input);
}
}
void runtime::gpu::GPUCallFrame::resolve_outputs(void** outputs, size_t num_outputs)
{
// num_outputs is > 0 iff we are resolving outputs from a nested function call
if (num_outputs == 0)
{
num_outputs = m_outputs.size();
}
for (size_t i = 0; i < num_outputs; i++)
{
void* output = outputs[i];
m_outputs[i] = static_cast<unsigned char*>(output);
}
}
// returns pointers of any TensorRole
std::vector<void*>
runtime::gpu::GPUCallFrame::get_tensor_io(const std::vector<GPUTensorWrapper>& tensors)
{
std::vector<void*> ptrs;
for (auto const& tensor : tensors)
{
auto offset = tensor.get_offset();
auto ptr = get_pointer(offset.first, offset.second, tensor.get_name());
ptrs.push_back(ptr);
}
return ptrs;
}
void* runtime::gpu::GPUCallFrame::get_pointer(const TensorRole& type,
const size_t& offset,
const std::string& name)
{
switch (type)
{
case TensorRole::CONSTANT:
case TensorRole::INTERMEDIATE:
return static_cast<void*>(m_memory_reservations.at(name) + offset);
case TensorRole::INPUT: return static_cast<void*>(m_inputs.at(offset));
case TensorRole::OUTPUT: return static_cast<void*>(m_outputs.at(offset));
case TensorRole::UNKNOWN:
default: throw ngraph_error("GPUCallFrame encountered unknown or uninitialized tensor type");
};
}
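// Illustrative call sequence for the class above (not part of the original file;
// local names are hypothetical):
//
//   GPUCallFrame frame(num_inputs, num_outputs);
//   frame.resolve_reservations(compiled_function, memory_reservations);
//   frame.resolve_inputs(inputs);   // raw device pointers supplied by the caller
//   frame.resolve_outputs(outputs);
//   std::vector<void*> args = frame.get_tensor_io(tensor_wrappers);
//   // each args[i] now points into the constant pool, workspace, input or output
//   // buffer selected by that wrapper's TensorRole and offset.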
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <functional>
#include <memory>
#include <unordered_map>
#include "ngraph/runtime/gpu/gpu_compiled_function.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_wrapper.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class GPUCallFrame
{
public:
GPUCallFrame(const size_t& num_inputs, const size_t& num_outputs);
void resolve_reservations(
const GPUCompiledFunction* compiled_function,
const std::unordered_map<std::string, size_t>& memory_reservations);
void resolve_inputs(void** inputs, size_t num_inputs = 0);
void resolve_outputs(void** outputs, size_t num_outputs = 0);
std::vector<void*> get_tensor_io(const std::vector<GPUTensorWrapper>& tensors);
private:
void* get_pointer(const TensorRole& type,
const size_t& offset,
const std::string& name = "");
std::unordered_map<std::string, unsigned char*> m_memory_reservations;
std::vector<unsigned char*> m_inputs;
std::vector<unsigned char*> m_outputs;
};
}
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <algorithm>
#include <cstdlib>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include <fstream>
#include <locale>
#include <mutex>
#include <string>
#include <tuple>
#include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/layout/dense_tensor_layout.hpp"
#include "ngraph/descriptor/output.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/function.hpp"
#include "ngraph/node.hpp"
#include "ngraph/pass/algebraic_simplification.hpp"
#include "ngraph/pass/fused_op_decomposition.hpp"
#include "ngraph/pass/get_output_element_elimination.hpp"
#include "ngraph/pass/implicit_broadcast_elimination.hpp"
#include "ngraph/pass/like_replacement.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_compiled_function.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_internal_function.hpp"
#include "ngraph/runtime/gpu/op/batch_norm.hpp"
#include "ngraph/runtime/gpu/op/rnn.hpp"
#include "ngraph/runtime/gpu/pass/gpu_batch_norm_cache.hpp"
#include "ngraph/runtime/gpu/pass/gpu_layout.hpp"
#include "ngraph/runtime/gpu/pass/gpu_rnn_fusion.hpp"
#include "ngraph/runtime/gpu/pass/tensor_memory_reservation.hpp"
using namespace std;
using namespace ngraph;
std::string runtime::gpu::GPUCompiledFunction::get_output_dir()
{
static std::string output_dir = "gpu_codegen";
return output_dir;
}
size_t runtime::gpu::GPUCompiledFunction::get_memory_alignment()
{
static size_t memory_pool_alignment = 64;
return memory_pool_alignment;
}
static std::mutex s_compilation;
class GPUStaticInitializers
{
public:
GPUStaticInitializers()
{
file_util::remove_directory(runtime::gpu::GPUCompiledFunction::get_output_dir());
file_util::make_directory(runtime::gpu::GPUCompiledFunction::get_output_dir());
}
};
static GPUStaticInitializers s_static_initializers;
runtime::gpu::GPUCompiledFunction::GPUCompiledFunction(
const shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context)
: m_runtime(nullptr)
, m_function(function)
, m_emit_timing(false)
, m_is_compiled(false)
, m_shared_context(shared_context)
{
}
runtime::gpu::GPUCompiledFunction::~GPUCompiledFunction()
{
}
std::vector<std::string> get_case_variants(std::vector<std::string> cases)
{
std::vector<std::string> results;
for (auto& c : cases)
{
results.push_back(c);
if (std::all_of(c.begin(), c.end(), ::isdigit))
{
continue;
}
for (auto i = 0u; i < c.size(); i++)
{
c[i] = std::toupper(c[i], std::locale());
if (i == 0)
{
results.emplace_back(c);
}
}
results.emplace_back(c);
}
return results;
}
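// For example, get_case_variants({"0", "false"}) returns {"0", "false", "False",
// "FALSE"}; purely numeric entries are left unchanged. This lets NGRAPH_CODEGEN=False
// and NGRAPH_CODEGEN=FALSE both disable codegen in make() below.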
std::shared_ptr<runtime::gpu::GPUCompiledFunction> runtime::gpu::GPUCompiledFunction::make(
const std::shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context)
{
#if defined(NGRAPH_DEX_ONLY)
return std::make_shared<runtime::gpu::GPUInternalFunction>(function, shared_context);
#else
// For now codegen is default unless explicitly disabled
bool use_codegen = true;
if (auto env = std::getenv("NGRAPH_CODEGEN"))
{
std::string env_codegen(env);
for (auto& opt : get_case_variants({"0", "false"}))
{
if (env_codegen == opt)
{
use_codegen = false;
}
}
}
if (use_codegen)
{
return std::make_shared<runtime::gpu::GPUExternalFunction>(function, shared_context);
}
else
{
return std::make_shared<runtime::gpu::GPUInternalFunction>(function, shared_context);
}
#endif
}
void runtime::gpu::GPUCompiledFunction::compile()
{
if (m_is_compiled)
{
return;
}
std::unique_lock<std::mutex> lock(s_compilation);
m_function_name = m_function->get_name();
auto allocator = std::make_shared<runtime::gpu::GPUAllocator>(
m_shared_context->m_primitive_emitter->get_memory_allocator());
ngraph::pass::Manager pass_manager;
#if CUDNN_VERSION >= 7200
// recurrent network fusion
pass_manager.register_pass<runtime::gpu::pass::LSTMFusion>();
pass_manager.register_pass<runtime::gpu::pass::RNNFusion>();
pass_manager.register_pass<ngraph::pass::AlgebraicSimplification>();
pass_manager.register_pass<runtime::gpu::pass::MultiLayerRNNFusion>();
#else
pass_manager.register_pass<ngraph::pass::AlgebraicSimplification>();
#endif
pass_manager.register_pass<runtime::gpu::pass::BatchNormCache>();
pass_manager.register_pass<ngraph::pass::LikeReplacement>();
pass_manager.register_pass<ngraph::pass::FusedOpDecomposition>();
pass_manager.register_pass<ngraph::pass::ImplicitBroadcastElimination>();
pass_manager.register_pass<runtime::gpu::pass::GPULayout>(this);
pass_manager.register_pass<ngraph::pass::AssignLayout<descriptor::layout::DenseTensorLayout>>();
pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>();
pass_manager.register_pass<ngraph::pass::Liveness>();
pass_manager.register_pass<ngraph::pass::MemoryLayout>(get_memory_alignment());
pass_manager.register_pass<runtime::gpu::pass::TensorMemoryReservation>(
*allocator, m_tensor_memory_buffers);
string dump_filename = file_util::path_join(get_output_dir(), m_function_name + "_ops.txt");
pass_manager.register_pass<ngraph::pass::DumpSorted>(dump_filename);
pass_manager.run_passes(m_function);
m_function_ordered_ops.emplace(m_function, m_function->get_ordered_ops());
add_passes(pass_manager);
emit();
// allocate device buffers for primitive arguments and workspace
allocator->close();
m_shared_context->m_primitive_emitter->allocate_primitive_memory();
compile_function();
m_is_compiled = true;
}
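// Note on ordering: Liveness and MemoryLayout compute tensor offsets against
// get_memory_alignment(), and TensorMemoryReservation records the resulting buffer
// sizes into m_tensor_memory_buffers, so allocator->close() and
// allocate_primitive_memory() can reserve the device workspace up front, before the
// generated runtime is invoked.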
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <functional>
#include <memory>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include "ngraph/function.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_wrapper.hpp"
#define EMIT_ARGS \
runtime::gpu::GPUCompiledFunction *compiled_function, const std::string &function_name, \
const Node *node, const std::vector<runtime::gpu::GPUTensorWrapper> &args, \
const std::vector<runtime::gpu::GPUTensorWrapper> &out
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class GPU_Emitter;
struct GPURuntimeContext;
class GPUCompiledFunction
{
friend class GPU_Backend;
friend class GPU_Executable;
public:
GPUCompiledFunction(
const std::shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context);
virtual ~GPUCompiledFunction();
static std::shared_ptr<GPUCompiledFunction>
make(const std::shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context);
std::unique_ptr<runtime::gpu::GPURuntimeContext>& ctx();
const std::unique_ptr<GPUPrimitiveEmitter>& get_primitive_emitter() const
{
return m_shared_context->m_primitive_emitter;
}
virtual std::string
add_to_runtime(size_t primitive_index,
const std::string& function_name,
const std::vector<runtime::gpu::GPUTensorWrapper>& args,
const std::vector<runtime::gpu::GPUTensorWrapper>& out) = 0;
virtual std::string
add_call_to_runtime(const std::string& caller,
const std::string& callee,
const std::vector<runtime::gpu::GPUTensorWrapper>& args,
const std::vector<runtime::gpu::GPUTensorWrapper>& out) = 0;
void compile();
virtual void
get_performance_data(std::vector<runtime::PerformanceCounter>& rc) const = 0;
static size_t get_memory_alignment();
static std::string get_output_dir();
protected:
virtual void compile_function() = 0;
virtual void add_passes(ngraph::pass::Manager& pass_manager) = 0;
virtual void emit() = 0;
EntryPoint m_runtime;
// For non-destructive passthrough kernels, propagate function
// input buffers to internal ops
virtual void propagate_in_place_input(ngraph::descriptor::Output* output,
const std::string& input_name) = 0;
// For in-place kernels, propagate function output buffers to
// internal ops
virtual void propagate_in_place_output(ngraph::descriptor::Output* res_src_output,
const std::string& output_name) = 0;
std::shared_ptr<ngraph::Function> m_function;
std::unordered_map<std::shared_ptr<Function>, std::list<std::shared_ptr<Node>>>
m_function_ordered_ops;
bool m_emit_timing;
bool m_is_compiled;
size_t m_offset;
std::string m_function_name;
std::unordered_map<std::string, size_t> m_tensor_memory_buffers;
std::shared_ptr<GPU_Backend::BackendContext> m_shared_context;
};
}
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <memory>
#include <string>
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"
using namespace ngraph;
runtime::gpu::CudaContextManager::CudaContextManager()
{
CUDA_SAFE_CALL(cuInit(0));
CUDA_SAFE_CALL(cuDeviceGet(&m_device, 0));
CUDA_SAFE_CALL(cuDevicePrimaryCtxRetain(&m_context, m_device));
}
runtime::gpu::CudaContextManager::~CudaContextManager()
{
CUDA_SAFE_CALL_NO_THROW(cuDevicePrimaryCtxRelease(m_device));
}
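// Note: retaining the device's primary context lets driver-API calls
// (cuModuleLoadDataEx, cuLaunchKernel) and runtime-API users (cudart, cuBLAS, cuDNN)
// share one context; BackendContext::bind_cuda_context_to_thread() simply calls
// SetContextCurrent() before any per-thread GPU work.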
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <cuda.h>
#include <memory>
#include <string>
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class CudaContextManager
{
public:
CudaContextManager();
~CudaContextManager();
CudaContextManager(CudaContextManager const&) = delete;
CudaContextManager(CudaContextManager&&) = delete;
CudaContextManager& operator=(CudaContextManager const&) = delete;
CudaContextManager& operator=(CudaContextManager&&) = delete;
CUcontext GetContext() { return m_context; }
void SetContextCurrent() { cuCtxSetCurrent(m_context); }
protected:
CUdevice m_device;
CUcontext m_context;
};
}
}
}
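A minimal usage sketch of the context manager above (the function name example_bind_context is hypothetical; only the class's own API and CUDA driver types are used): the manager retains the primary context for device 0 on construction, can bind it to the calling thread, and releases it on destruction.

#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"

void example_bind_context()
{
    ngraph::runtime::gpu::CudaContextManager manager; // cuInit + primary-context retain
    manager.SetContextCurrent();                      // cuCtxSetCurrent on this thread
    CUcontext raw = manager.GetContext();             // raw handle, still owned by the manager
    (void)raw;                                        // use the context with driver API calls here
} // destructor releases the primary context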
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <cstring>
#include <iostream>
#include <string>
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
using namespace ngraph;
std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionBuilder::get(const std::string& name,
const std::string& kernel,
int number_of_options,
const char** options)
{
nvrtcProgram prog;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,
kernel.c_str(),
"ngraph.cu",
0, // numHeaders
NULL, // headers
NULL)); // includeNames
nvrtcResult compile_result = nvrtcCompileProgram(prog, number_of_options, options);
// output compiler log helper
auto emit_log = [&prog]() {
size_t logSize;
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
char* log = static_cast<char*>(malloc(sizeof(char) * logSize + 1));
NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
log[logSize] = '\x0';
if (std::strlen(log) >= 2)
{
std::cerr << log;
}
free(log);
};
// throw if compilation was not successful
if (compile_result != NVRTC_SUCCESS)
{
std::cerr << "Compile error: \n" + kernel;
// output compiler errors
emit_log();
throw std::runtime_error("NVRTC compilation failure.");
}
// output any compiler warnings
emit_log();
// retrieve the intermediate PTX for the compiled program
size_t ptx_size;
NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptx_size));
char* ptx = new char[ptx_size];
NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx));
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); // the NVRTC program is no longer needed
// load the generated PTX and extract a handle to the compiled kernel
CUmodule module;
CUfunction function;
CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, nullptr, nullptr));
CUDA_SAFE_CALL(cuModuleGetFunction(&function, module, name.c_str()));
delete[] ptx; // the module keeps its own copy of the PTX image
return std::make_shared<CUfunction>(function);
}
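A minimal sketch of driving this builder directly (the kernel source, the symbol name cuda_fill, and the buffer size are hypothetical; a current CUDA context is assumed to have been set, e.g. via CudaContextManager):

#include <cuda.h>
#include <string>
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"

void example_compile_and_launch()
{
    // extern "C" keeps the symbol unmangled so cuModuleGetFunction can find it by name
    const std::string kernel_source =
        "extern \"C\" __global__ void cuda_fill(float* data) { data[threadIdx.x] = 1.0f; }";
    const char* opts[] = {"--gpu-architecture=compute_35"};
    auto fn = ngraph::runtime::gpu::CudaFunctionBuilder::get("cuda_fill", kernel_source, 1, opts);

    CUdeviceptr buffer;
    cuMemAlloc(&buffer, 32 * sizeof(float));
    void* args[] = {&buffer};
    cuLaunchKernel(*fn, 1, 1, 1, 32, 1, 1, 0, nullptr, args, nullptr); // one block of 32 threads
    cuCtxSynchronize();
    cuMemFree(buffer);
}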
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <memory>
#include <string>
#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class CudaFunctionBuilder
{
public:
static std::shared_ptr<CUfunction> get(const std::string& name,
const std::string& kernel,
int number_of_options,
const char** options);
};
}
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <cctype>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
static const std::string s_output_dir = "gpu_codegen";
using namespace ngraph;
std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionPool::set(const std::string& name,
const std::string& kernel)
{
const char* opts[] = {"--gpu-architecture=compute_35", "--relocatable-device-code=true"};
std::string filename =
file_util::path_join(s_output_dir, "cuda_kernel_" + name + "_codegen.cu");
std::ofstream out(filename);
out << kernel;
out.close();
auto cu_compiled_function = CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts);
m_function_map.insert({name, cu_compiled_function});
return cu_compiled_function;
}
std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionPool::get(const std::string& name)
{
auto it = m_function_map.find(name);
if (it != m_function_map.end())
{
return (*it).second;
}
return nullptr;
}
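An illustrative wrapper (the helper name get_or_compile is hypothetical) showing the intended get-then-set caching pattern, so each kernel only goes through NVRTC once per name:

#include <memory>
#include <string>
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"

std::shared_ptr<CUfunction> get_or_compile(ngraph::runtime::gpu::CudaFunctionPool& pool,
                                           const std::string& name,
                                           const std::string& kernel_source)
{
    if (auto cached = pool.get(name)) // cache hit: reuse the previously compiled CUfunction
    {
        return cached;
    }
    return pool.set(name, kernel_source); // cache miss: compile with NVRTC, store, and return
}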
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <cuda.h>
#include <memory>
#include <string>
#include <unordered_map>
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class CudaFunctionPool
{
public:
CudaFunctionPool() {}
~CudaFunctionPool() {}
std::shared_ptr<CUfunction> set(const std::string& name, const std::string& kernel);
std::shared_ptr<CUfunction> get(const std::string& name);
private:
std::unordered_map<std::string, std::shared_ptr<CUfunction>> m_function_map;
};
}
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <string>
#include <vector>
#include "ngraph/node.hpp"
#include "ngraph/runtime/gpu/gpu_compiled_function.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_wrapper.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class GPU_Emitter
{
public:
static std::function<std::string(EMIT_ARGS)> get_emit_function(const Node& node);
// The op table include below expands to a collection of function declarations such as:
//   static std::string emit_Abs(EMIT_ARGS);
//   static std::string emit_Acos(EMIT_ARGS);
#define NGRAPH_OP(a, b) static std::string emit_##a(EMIT_ARGS);
#include "ngraph/runtime/gpu/op/op_tbl.hpp"
#undef NGRAPH_OP
template <typename T>
static std::string emit_elementwise(EMIT_ARGS)
{
if (out[0].get_size() == 0)
{
return "";
}
else if (out.size() > 1)
{
throw std::runtime_error(
"Multi-output elementwise ops are not currently supported.");
}
auto& cuda_emitter =
compiled_function->get_primitive_emitter()->get_cuda_emitter();
std::vector<std::string> dtypes;
for (auto& arg : args)
{
dtypes.push_back(arg.get_type());
}
// Special case for bool data type.
if (out[0].get_element_type() == element::boolean)
{
dtypes.push_back("bool");
}
else
{
dtypes.push_back(out[0].get_type());
}
auto ew_index = cuda_emitter->build_elementwise<T>(dtypes, out[0].get_shape());
return compiled_function->add_to_runtime(ew_index, function_name, args, out);
}
static std::string emit_ArgReduce(EMIT_ARGS, cudnnReduceTensorOp_t);
static std::string emit_Sum_0(EMIT_ARGS);
static std::string emit_Sum_1(EMIT_ARGS);
/// \brief Create a list of node names for each arg in args
/// \param args list of tensor arguments
/// \param arg_indexes a list of indexes into args for which args to include in
/// the output list, so {1, 2} will include args 1 and 2 and skip 0.
/// \return a string containing "arg0_name, arg1_name, etc."
static std::string node_names(const std::vector<GPUTensorWrapper>& args,
std::initializer_list<int> arg_indexes = {});
};
Shape get_padded_shape(const Shape& input_shape,
const Shape& padding_below,
const Shape& padding_above,
const Shape& padding_interior);
}
}
}
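A hedged illustration of the NGRAPH_OP X-macro used in GPU_Emitter above (the table entries shown are examples, not a copy of op_tbl.hpp): each entry in the op table expands to one static emitter declaration, so the set of per-op emit functions tracks the table without hand-written boilerplate.

// Hypothetical excerpt of an op table consumed by the X-macro:
//     NGRAPH_OP(Abs, ngraph::op)
//     NGRAPH_OP(Add, ngraph::op)
//
// With the definition inside GPU_Emitter,
//     #define NGRAPH_OP(a, b) static std::string emit_##a(EMIT_ARGS);
// including the table expands to:
//     static std::string emit_Abs(EMIT_ARGS);
//     static std::string emit_Add(EMIT_ARGS);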
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#if !defined(NGRAPH_DEX_ONLY)
#include <functional>
#include <memory>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include "ngraph/code_writer.hpp"
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/function.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_compiled_function.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_wrapper.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class GPU_Emitter;
struct GPURuntimeContext;
class GPUExternalFunction : public GPUCompiledFunction
{
public:
GPUExternalFunction(
const std::shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context);
virtual ~GPUExternalFunction();
virtual std::string
add_to_runtime(size_t primitive_index,
const std::string& function_name,
const std::vector<runtime::gpu::GPUTensorWrapper>& args,
const std::vector<runtime::gpu::GPUTensorWrapper>& out) override;
virtual std::string add_call_to_runtime(
const std::string& caller,
const std::string& callee,
const std::vector<runtime::gpu::GPUTensorWrapper>& args,
const std::vector<runtime::gpu::GPUTensorWrapper>& out) override;
virtual void get_performance_data(
std::vector<runtime::PerformanceCounter>& rc) const override;
protected:
virtual void compile_function() override;
virtual void add_passes(ngraph::pass::Manager& pass_manager) override;
virtual void emit() override;
private:
/// \brief Create a list of node names for each arg in args
/// \param args list of tensor arguments
/// \param arg_indexes a list of indexes into args for which args to include in
/// the output list, so {1, 2} will include args 1 and 2 and skip 0.
/// \return a string containing "arg0_name, arg1_name, etc."
std::string node_names(const std::vector<runtime::gpu::GPUTensorWrapper>& args,
std::initializer_list<int> arg_indexes = {});
void emit_header();
void emit_timer_functions();
void emit_constant_declarations();
void emit_function_declarations();
void emit_functions();
void emit_debug_function_entry(Node* node);
void emit_debug_function_exit(Node* node);
void emit_temp_mem_pool_allocation(std::shared_ptr<Function> current_function);
void store_emitted_functions(const std::string& code);
std::string emit_op(EMIT_ARGS);
std::string emit_op_as_function(const Node& node, const std::string& function_name);
std::string strip_comments(const std::string& s) const;
static const std::string& get_pch_header_source();
static const std::string& get_header_source();
// For non-destructive passthrough kernels, propagate function
// input buffers to internal ops
virtual void propagate_in_place_input(ngraph::descriptor::Output* output,
const std::string& input_name) override;
// For in-place kernels, propagate function output buffers to
// internal ops
virtual void propagate_in_place_output(ngraph::descriptor::Output* res_src_output,
const std::string& output_name) override;
CodeWriter m_writer;
std::string m_common_function_string;
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
std::map<std::string, size_t> m_name_index_map;
std::unordered_map<std::string, std::string> m_variable_name_map;
std::unordered_map<Node*, Node*> m_node_function_map;
};
}
}
}
#endif // !defined(NGRAPH_DEX_ONLY)
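A minimal call-site sketch (the function name example_compile and its parameters are hypothetical; it assumes the codegen path is enabled, i.e. NGRAPH_DEX_ONLY is not defined, and that the caller already holds the Function and the backend's shared BackendContext):

// assumes <memory> and the header declaring GPUExternalFunction above are included
void example_compile(const std::shared_ptr<ngraph::Function>& func,
                     const std::shared_ptr<ngraph::runtime::gpu::GPU_Backend::BackendContext>& ctx)
{
    auto compiled = std::make_shared<ngraph::runtime::gpu::GPUExternalFunction>(func, ctx);
    compiled->compile(); // compiles the function: pass setup, C++ emission, and JIT compilation
}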
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
namespace ngraph
{
namespace runtime
{
namespace gpu
{
void example_kernel();
}
}
}