Commit c48807c3 authored by Boris Fomitchev, committed by Alexander Alekhin

Merge pull request #9418 from borisfom:cuda9

CUDA9 build fixed, added detection (#9418)

* CUDA9 build fixed, added detection

* Replacing deprecated __shfl_xxx with __shfl_sync, fixing bogus CUDA9 warnings
parent d0509f67
...@@ -195,8 +195,8 @@ OCV_OPTION(WITH_CPUFEATURES "Use cpufeatures Android library" ON ...@@ -195,8 +195,8 @@ OCV_OPTION(WITH_CPUFEATURES "Use cpufeatures Android library" ON
OCV_OPTION(WITH_VTK "Include VTK library support (and build opencv_viz module eiher)" ON IF (NOT ANDROID AND NOT IOS AND NOT WINRT AND NOT CMAKE_CROSSCOMPILING) ) OCV_OPTION(WITH_VTK "Include VTK library support (and build opencv_viz module eiher)" ON IF (NOT ANDROID AND NOT IOS AND NOT WINRT AND NOT CMAKE_CROSSCOMPILING) )
OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (NOT IOS AND NOT WINRT) ) OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (NOT IOS AND NOT WINRT) )
OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (NOT IOS AND NOT WINRT) ) OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (NOT IOS AND NOT WINRT) )
OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (NOT IOS AND NOT WINRT) ) OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" ON IF (NOT IOS AND NOT WINRT) )
OCV_OPTION(WITH_NVCUVID "Include NVidia Video Decoding library support" OFF IF (NOT IOS AND NOT APPLE) ) OCV_OPTION(WITH_NVCUVID "Include NVidia Video Decoding library support" ON IF (NOT IOS AND NOT APPLE) )
OCV_OPTION(WITH_EIGEN "Include Eigen2/Eigen3 support" ON IF (NOT WINRT) ) OCV_OPTION(WITH_EIGEN "Include Eigen2/Eigen3 support" ON IF (NOT WINRT) )
OCV_OPTION(WITH_VFW "Include Video for Windows support" ON IF WIN32 ) OCV_OPTION(WITH_VFW "Include Video for Windows support" ON IF WIN32 )
OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" ON IF (NOT ANDROID AND NOT IOS AND NOT WINRT) ) OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" ON IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
......
...@@ -790,8 +790,18 @@ endif() ...@@ -790,8 +790,18 @@ endif()
if(CUDA_VERSION VERSION_GREATER "5.0") if(CUDA_VERSION VERSION_GREATER "5.0")
# In CUDA 5.5 NPP was split into 3 separate libraries. # In CUDA 5.5 NPP was split into 3 separate libraries.
find_cuda_helper_libs(nppc) find_cuda_helper_libs(nppc)
find_cuda_helper_libs(nppi) find_cuda_helper_libs(nppial)
find_cuda_helper_libs(nppicc)
find_cuda_helper_libs(nppicom)
find_cuda_helper_libs(nppidei)
find_cuda_helper_libs(nppif)
find_cuda_helper_libs(nppig)
find_cuda_helper_libs(nppim)
find_cuda_helper_libs(nppist)
find_cuda_helper_libs(nppisu)
find_cuda_helper_libs(nppitc)
find_cuda_helper_libs(npps) find_cuda_helper_libs(npps)
set(CUDA_nppi_LIBRARY "${CUDA_nppial_LIBRARY};${CUDA_nppicc_LIBRARY};${CUDA_nppicom_LIBRARY};${CUDA_nppidei_LIBRARY};${CUDA_nppif_LIBRARY};${CUDA_nppig_LIBRARY};${CUDA_nppim_LIBRARY};${CUDA_nppist_LIBRARY};${CUDA_nppisu_LIBRARY};${CUDA_nppitc_LIBRARY}")
set(CUDA_npp_LIBRARY "${CUDA_nppc_LIBRARY};${CUDA_nppi_LIBRARY};${CUDA_npps_LIBRARY}") set(CUDA_npp_LIBRARY "${CUDA_nppc_LIBRARY};${CUDA_nppi_LIBRARY};${CUDA_npps_LIBRARY}")
elseif(NOT CUDA_VERSION VERSION_LESS "4.0") elseif(NOT CUDA_VERSION VERSION_LESS "4.0")
find_cuda_helper_libs(npp) find_cuda_helper_libs(npp)
......
...@@ -43,7 +43,7 @@ if(CUDA_FOUND) ...@@ -43,7 +43,7 @@ if(CUDA_FOUND)
message(STATUS "CUDA detected: " ${CUDA_VERSION}) message(STATUS "CUDA detected: " ${CUDA_VERSION})
set(_generations "Fermi" "Kepler" "Maxwell" "Pascal") set(_generations "Fermi" "Kepler" "Maxwell" "Pascal" "Volta")
if(NOT CMAKE_CROSSCOMPILING) if(NOT CMAKE_CROSSCOMPILING)
list(APPEND _generations "Auto") list(APPEND _generations "Auto")
endif() endif()
...@@ -70,6 +70,8 @@ if(CUDA_FOUND) ...@@ -70,6 +70,8 @@ if(CUDA_FOUND)
set(__cuda_arch_bin "5.0 5.2") set(__cuda_arch_bin "5.0 5.2")
elseif(CUDA_GENERATION STREQUAL "Pascal") elseif(CUDA_GENERATION STREQUAL "Pascal")
set(__cuda_arch_bin "6.0 6.1") set(__cuda_arch_bin "6.0 6.1")
elseif(CUDA_GENERATION STREQUAL "Volta")
set(__cuda_arch_bin "7.0")
elseif(CUDA_GENERATION STREQUAL "Auto") elseif(CUDA_GENERATION STREQUAL "Auto")
execute_process( COMMAND "${CUDA_NVCC_EXECUTABLE}" "${OpenCV_SOURCE_DIR}/cmake/checks/OpenCVDetectCudaArch.cu" "--run" execute_process( COMMAND "${CUDA_NVCC_EXECUTABLE}" "${OpenCV_SOURCE_DIR}/cmake/checks/OpenCVDetectCudaArch.cu" "--run"
WORKING_DIRECTORY "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/" WORKING_DIRECTORY "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/"
...@@ -94,17 +96,17 @@ if(CUDA_FOUND) ...@@ -94,17 +96,17 @@ if(CUDA_FOUND)
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT _nvcc_res EQUAL 0) if(NOT _nvcc_res EQUAL 0)
message(STATUS "Automatic detection of CUDA generation failed. Going to build for all known architectures.") message(STATUS "Automatic detection of CUDA generation failed. Going to build for all known architectures.")
set(__cuda_arch_bin "5.3 6.2") set(__cuda_arch_bin "5.3 6.2 7.0")
else() else()
set(__cuda_arch_bin "${_nvcc_out}") set(__cuda_arch_bin "${_nvcc_out}")
string(REPLACE "2.1" "2.1(2.0)" __cuda_arch_bin "${__cuda_arch_bin}") string(REPLACE "2.1" "2.1(2.0)" __cuda_arch_bin "${__cuda_arch_bin}")
endif() endif()
set(__cuda_arch_ptx "") set(__cuda_arch_ptx "")
else() else()
if(${CUDA_VERSION} VERSION_LESS "8.0") if(${CUDA_VERSION} VERSION_LESS "9.0")
set(__cuda_arch_bin "2.0 3.0 3.5 3.7 5.0 5.2")
else()
set(__cuda_arch_bin "2.0 3.0 3.5 3.7 5.0 5.2 6.0 6.1") set(__cuda_arch_bin "2.0 3.0 3.5 3.7 5.0 5.2 6.0 6.1")
else()
set(__cuda_arch_bin "3.0 3.5 3.7 5.0 5.2 6.0 6.1 7.0")
endif() endif()
endif() endif()
endif() endif()
......
...@@ -58,6 +58,14 @@ ...@@ -58,6 +58,14 @@
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
# include <cuda.h> # include <cuda.h>
# include <cuda_runtime.h> # include <cuda_runtime.h>
# if defined (__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wstrict-aliasing"
# include <cuda_fp16.h>
# pragma GCC diagnostic pop
# else
# include <cuda_fp16.h>
# endif /* defined (__GNUC__) */
# include <npp.h> # include <npp.h>
# include "opencv2/core/cuda_stream_accessor.hpp" # include "opencv2/core/cuda_stream_accessor.hpp"
# include "opencv2/core/cuda/common.hpp" # include "opencv2/core/cuda/common.hpp"
......
...@@ -56,7 +56,7 @@ ...@@ -56,7 +56,7 @@
#include "opencv2/core/private.cuda.hpp" #include "opencv2/core/private.cuda.hpp"
#ifdef HAVE_NVCUVID #ifdef HAVE_NVCUVID
#include <nvcuvid.h> #include <dynlink_nvcuvid.h>
#ifdef _WIN32 #ifdef _WIN32
#define NOMINMAX #define NOMINMAX
......
...@@ -47,6 +47,7 @@ ...@@ -47,6 +47,7 @@
#define OPENCV_CUDEV_UTIL_SATURATE_CAST_HPP #define OPENCV_CUDEV_UTIL_SATURATE_CAST_HPP
#include "../common.hpp" #include "../common.hpp"
#include "opencv2/core/private.cuda.hpp"
namespace cv { namespace cudev { namespace cv { namespace cudev {
...@@ -274,12 +275,21 @@ template <typename T, typename D> __device__ __forceinline__ D cast_fp16(T v); ...@@ -274,12 +275,21 @@ template <typename T, typename D> __device__ __forceinline__ D cast_fp16(T v);
// Specialization: converts a half-precision value held in the 16 bits of a
// `short` into a `float`.
//   v       - storage holding the half-precision bit pattern
//   returns - the value converted to float
template <> __device__ __forceinline__ float cast_fp16<short, float>(short v)
{
#if __CUDACC_VER_MAJOR__ >= 9
    // View the storage of `v` as a __half object, then convert that object
    // to float.
    __half* const halfBits = reinterpret_cast<__half*>(&v);
    return float(*halfBits);
#else
    // Older toolkits: the short is handed to __half2float directly.
    return __half2float(v);
#endif
}
// Specialization: converts a `float` to half precision and returns the
// resulting 16-bit pattern stored in a `short`.
//   v       - single-precision input value
//   returns - bit pattern of the half-precision conversion of `v`
template <> __device__ __forceinline__ short cast_fp16<float, short>(float v)
{
#if __CUDACC_VER_MAJOR__ >= 9
    __half h(v);
    // Read the bits of the converted half `h`. Reinterpreting `&v` here would
    // instead return the low 16 bits of the original float and discard the
    // conversion result.
    return *(short*)&h;
#else
    return (short)__float2half_rn(v);
#endif
}
//! @} //! @}
......
...@@ -56,8 +56,14 @@ namespace cv { namespace cudev { ...@@ -56,8 +56,14 @@ namespace cv { namespace cudev {
#if CV_CUDEV_ARCH >= 300 #if CV_CUDEV_ARCH >= 300
// shfl #if __CUDACC_VER_MAJOR__ >= 9
// Compatibility shim guarded by __CUDACC_VER_MAJOR__ >= 9: each legacy
// mask-less shuffle name (__shfl, __shfl_xor, __shfl_up, __shfl_down) is
// redefined to forward its three arguments to the matching *_sync intrinsic,
// with 0xFFFFFFFFU passed as the participant mask. The four names are
// #undef'd again before the closing `#endif // CV_CUDEV_ARCH >= 300`.
// NOTE(review): a full mask presumes all 32 lanes of the warp reach every
// shuffle call - confirm against the call sites in this header.
# define __shfl(x, y, z) __shfl_sync(0xFFFFFFFFU, x, y, z)
# define __shfl_xor(x, y, z) __shfl_xor_sync(0xFFFFFFFFU, x, y, z)
# define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
# define __shfl_down(x, y, z) __shfl_down_sync(0xFFFFFFFFU, x, y, z)
#endif
// shfl
__device__ __forceinline__ uchar shfl(uchar val, int srcLane, int width = warpSize) __device__ __forceinline__ uchar shfl(uchar val, int srcLane, int width = warpSize)
{ {
return (uchar) __shfl((int) val, srcLane, width); return (uchar) __shfl((int) val, srcLane, width);
...@@ -419,6 +425,10 @@ CV_CUDEV_SHFL_XOR_VEC_INST(float) ...@@ -419,6 +425,10 @@ CV_CUDEV_SHFL_XOR_VEC_INST(float)
CV_CUDEV_SHFL_XOR_VEC_INST(double) CV_CUDEV_SHFL_XOR_VEC_INST(double)
#undef CV_CUDEV_SHFL_XOR_VEC_INST #undef CV_CUDEV_SHFL_XOR_VEC_INST
#undef __shfl
#undef __shfl_xor
#undef __shfl_up
#undef __shfl_down
#endif // CV_CUDEV_ARCH >= 300 #endif // CV_CUDEV_ARCH >= 300
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment