Commit 5a5c82bb authored by Alexander Smorkalov

Additional ENABLE_DYNAMIC_CUDA option implemented in cmake. Warning fixes and refactoring.

parent 037ffcdf
@@ -201,6 +201,7 @@ OCV_OPTION(INSTALL_TO_MANGLED_PATHS "Enables mangled install paths, that help wi
# OpenCV build options
# ===================================================
+OCV_OPTION(ENABLE_DYNAMIC_CUDA "Enabled dynamic CUDA linkage" ON IF ANDROID OR LINUX)
OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers" ON IF (NOT IOS) )
OCV_OPTION(ENABLE_SOLUTION_FOLDERS "Solution folder in Visual Studio or in other IDEs" (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) IF (CMAKE_VERSION VERSION_GREATER "2.8.0") )
OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add flags: -g -pg)" OFF IF CMAKE_COMPILER_IS_GNUCXX )
...
set(the_description "The Core Functionality")
-if (HAVE_opencv_dynamicuda)
+message(STATUS "ENABLE_DYNAMIC_CUDA ${ENABLE_DYNAMIC_CUDA}")
+if (ENABLE_DYNAMIC_CUDA)
+  message(STATUS "Using dynamic cuda approach")
  ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
else()
+  message(STATUS "Link CUDA statically")
  ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
endif()
@@ -12,7 +16,7 @@ if(HAVE_WINRT)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"")
endif()
-if(HAVE_opencv_dynamicuda)
+if(ENABLE_DYNAMIC_CUDA)
  add_definitions(-DDYNAMIC_CUDA_SUPPORT)
else()
  add_definitions(-DUSE_CUDA)
@@ -26,18 +30,18 @@ endif()
file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h")
file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
-if (NOT HAVE_opencv_dynamicuda)
+if (NOT ENABLE_DYNAMIC_CUDA)
  file(GLOB lib_cuda "../dynamicuda/src/cuda/*.cu*")
endif()
source_group("Cuda Headers" FILES ${lib_cuda_hdrs})
source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
-if (NOT HAVE_opencv_dynamicuda)
+if (NOT ENABLE_DYNAMIC_CUDA)
  source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
endif()
-if (HAVE_opencv_dynamicuda)
+if (ENABLE_DYNAMIC_CUDA)
  ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
                          HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
else()
...
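The core hunks above switch between two compile definitions: -DDYNAMIC_CUDA_SUPPORT when ENABLE_DYNAMIC_CUDA is ON and -DUSE_CUDA otherwise. Below is a minimal sketch of how a translation unit can branch on those macros; the branch bodies are illustrative comments only, not the commit's actual core sources.

#if defined(DYNAMIC_CUDA_SUPPORT)
    // ENABLE_DYNAMIC_CUDA=ON: GPU routines are reached through the separate
    // dynamicuda shared module, so core itself carries no CUDA/NPP link dependency.
#elif defined(USE_CUDA)
    // ENABLE_DYNAMIC_CUDA=OFF with CUDA available: CUDA/NPP implementations are
    // linked into core statically (the ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY} branch above).
#else
    // Build without CUDA: GPU entry points fall back to "no GPU support" stubs.
#endif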
@@ -5,6 +5,7 @@ endif()
set(the_description "Dynamic CUDA linkage")
add_definitions(-DUSE_CUDA)
+ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
ocv_module_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
set(OPENCV_MODULE_TYPE SHARED)
if (BUILD_FAT_JAVA_LIB)
...
#ifndef __GPUMAT_CUDA_HPP__
#define __GPUMAT_CUDA_HPP__

class DeviceInfoFuncTable
{
public:
    // cv::DeviceInfo
    virtual size_t sharedMemPerBlock() const = 0;
    virtual void queryMemory(size_t&, size_t&) const = 0;
    virtual size_t freeMemory() const = 0;
    virtual size_t totalMemory() const = 0;
    virtual bool supports(FeatureSet) const = 0;
    virtual bool isCompatible() const = 0;
    virtual void query() = 0;
    virtual int deviceID() const = 0;
    virtual std::string name() const = 0;
    virtual int majorVersion() const = 0;
    virtual int minorVersion() const = 0;
    virtual int multiProcessorCount() const = 0;
    virtual int getCudaEnabledDeviceCount() const = 0;
    virtual void setDevice(int) const = 0;
    virtual int getDevice() const = 0;
    virtual void resetDevice() const = 0;
    virtual bool deviceSupports(FeatureSet) const = 0;
    // cv::TargetArchs
    virtual bool builtWith(FeatureSet) const = 0;
    virtual bool has(int, int) const = 0;
    virtual bool hasPtx(int, int) const = 0;
    virtual bool hasBin(int, int) const = 0;
    virtual bool hasEqualOrLessPtx(int, int) const = 0;
    virtual bool hasEqualOrGreater(int, int) const = 0;
    virtual bool hasEqualOrGreaterPtx(int, int) const = 0;
    virtual bool hasEqualOrGreaterBin(int, int) const = 0;
    virtual void printCudaDeviceInfo(int) const = 0;
    virtual void printShortCudaDeviceInfo(int) const = 0;
    virtual ~DeviceInfoFuncTable() {};
};

class GpuFuncTable
{
public:
    virtual ~GpuFuncTable() {}
    // GpuMat routines
    virtual void copy(const Mat& src, GpuMat& dst) const = 0;
    virtual void copy(const GpuMat& src, Mat& dst) const = 0;
    virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
    virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
    // gpu::device::convertTo funcs
    virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) const = 0;
    virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
    // for gpu::device::setTo funcs
    virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0;
    virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
    virtual void free(void* devPtr) const = 0;
};

class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable
{
public:
    size_t sharedMemPerBlock() const { throw_nogpu; return 0; }
    void queryMemory(size_t&, size_t&) const { throw_nogpu; }
    size_t freeMemory() const { throw_nogpu; return 0; }
    size_t totalMemory() const { throw_nogpu; return 0; }
    bool supports(FeatureSet) const { throw_nogpu; return false; }
    bool isCompatible() const { throw_nogpu; return false; }
    void query() { throw_nogpu; }
    int deviceID() const { throw_nogpu; return -1; };
    std::string name() const { throw_nogpu; return std::string(); }
    int majorVersion() const { throw_nogpu; return -1; }
    int minorVersion() const { throw_nogpu; return -1; }
    int multiProcessorCount() const { throw_nogpu; return -1; }
    int getCudaEnabledDeviceCount() const { return 0; }
    void setDevice(int) const { throw_nogpu; }
    int getDevice() const { throw_nogpu; return 0; }
    void resetDevice() const { throw_nogpu; }
    bool deviceSupports(FeatureSet) const { throw_nogpu; return false; }
    bool builtWith(FeatureSet) const { throw_nogpu; return false; }
    bool has(int, int) const { throw_nogpu; return false; }
    bool hasPtx(int, int) const { throw_nogpu; return false; }
    bool hasBin(int, int) const { throw_nogpu; return false; }
    bool hasEqualOrLessPtx(int, int) const { throw_nogpu; return false; }
    bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; }
    bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; }
    bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; }
    void printCudaDeviceInfo(int) const { throw_nogpu; }
    void printShortCudaDeviceInfo(int) const { throw_nogpu; }
};

class EmptyFuncTable : public GpuFuncTable
{
public:
    void copy(const Mat&, GpuMat&) const { throw_nogpu; }
    void copy(const GpuMat&, Mat&) const { throw_nogpu; }
    void copy(const GpuMat&, GpuMat&) const { throw_nogpu; }
    void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; }
    void convert(const GpuMat&, GpuMat&) const { throw_nogpu; }
    void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; }
    virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; }
    void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; }
    void free(void*) const {}
};
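DeviceInfoFuncTable and GpuFuncTable declare the GPU entry points as pure virtual calls, while the Empty* classes above provide the CPU-only fallback that simply reports the absence of CUDA through throw_nogpu; the CUDA-backed tables follow under USE_CUDA. A hedged, self-contained sketch of the dispatch pattern this enables is shown below; the names in it are invented for illustration and are not part of this commit.

#include <cstdio>

struct FuncTable { virtual void copy() const = 0; virtual ~FuncTable() {} };
struct EmptyTable : FuncTable { void copy() const { std::printf("no CUDA support\n"); } };
struct CudaTable  : FuncTable { void copy() const { std::printf("CUDA-backed copy\n"); } };

// Callers hold only a FuncTable pointer; which concrete table sits behind it is
// decided once by the build configuration (or, with dynamic linkage, at run time).
static const FuncTable* activeTable()
{
#if defined(USE_CUDA)
    static CudaTable t;
#else
    static EmptyTable t;
#endif
    return &t;
}

int main() { activeTable()->copy(); return 0; }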
#if defined(USE_CUDA)
@@ -153,940 +153,949 @@ namespace cv { namespace gpu { namespace device
    void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
}}}

template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
{
    Scalar_<T> sf = s;
    cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream);
}

template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
{
    Scalar_<T> sf = s;
    cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
}

template<int n> struct NPPTypeTraits;
template<> struct NPPTypeTraits<CV_8U>  { typedef Npp8u npp_type; };
template<> struct NPPTypeTraits<CV_8S>  { typedef Npp8s npp_type; };
template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
template<> struct NPPTypeTraits<CV_64F> { typedef Npp64f npp_type; };

//////////////////////////////////////////////////////////////////////////
// Convert

template<int SDEPTH, int DDEPTH> struct NppConvertFunc
{
    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
    typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
    typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
};
template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
{
    typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
    typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
};
template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
{
    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
    typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

    static void call(const GpuMat& src, GpuMat& dst)
    {
        NppiSize sz;
        sz.width = src.cols;
        sz.height = src.rows;
        nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );

        cudaSafeCall( cudaDeviceSynchronize() );
    }
};
template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
{
    typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

    static void call(const GpuMat& src, GpuMat& dst)
    {
        NppiSize sz;
        sz.width = src.cols;
        sz.height = src.rows;
        nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );

        cudaSafeCall( cudaDeviceSynchronize() );
    }
};

//////////////////////////////////////////////////////////////////////////
// Set

template<int SDEPTH, int SCN> struct NppSetFunc
{
    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
    typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
};
template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
{
    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
    typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
};
template<int SCN> struct NppSetFunc<CV_8S, SCN>
{
    typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
};
template<> struct NppSetFunc<CV_8S, 1>
{
    typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
};

template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
{
    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

    static void call(GpuMat& src, Scalar s)
    {
        NppiSize sz;
        sz.width = src.cols;
        sz.height = src.rows;

        Scalar_<src_t> nppS = s;

        nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );

        cudaSafeCall( cudaDeviceSynchronize() );
    }
};
template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
{
    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

    static void call(GpuMat& src, Scalar s)
    {
        NppiSize sz;
        sz.width = src.cols;
        sz.height = src.rows;

        Scalar_<src_t> nppS = s;

        nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz) );

        cudaSafeCall( cudaDeviceSynchronize() );
    }
};
template<int SDEPTH, int SCN> struct NppSetMaskFunc
{
    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
    typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
};
template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
{
    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
    typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
};

template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
{
    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

    static void call(GpuMat& src, Scalar s, const GpuMat& mask)
    {
        NppiSize sz;
        sz.width = src.cols;
        sz.height = src.rows;

        Scalar_<src_t> nppS = s;

        nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );

        cudaSafeCall( cudaDeviceSynchronize() );
    }
};
template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
{
    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

    static void call(GpuMat& src, Scalar s, const GpuMat& mask)
    {
        NppiSize sz;
        sz.width = src.cols;
        sz.height = src.rows;

        Scalar_<src_t> nppS = s;

        nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );

        cudaSafeCall( cudaDeviceSynchronize() );
    }
};

//////////////////////////////////////////////////////////////////////////
// CopyMasked

template<int SDEPTH> struct NppCopyMaskedFunc
{
    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
    typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
};

template<int SDEPTH, typename NppCopyMaskedFunc<SDEPTH>::func_ptr func> struct NppCopyMasked
{
    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

    static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/)
    {
        NppiSize sz;
        sz.width = src.cols;
        sz.height = src.rows;
        nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<src_t>(), static_cast<int>(dst.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );

        cudaSafeCall( cudaDeviceSynchronize() );
    }
};

template <typename T> static inline bool isAligned(const T* ptr, size_t size)
{
    return reinterpret_cast<size_t>(ptr) % size == 0;
}

namespace cv { namespace gpu { namespace device
{
    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0);
    void convertTo(const GpuMat& src, GpuMat& dst);
    void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0);
    void setTo(GpuMat& src, Scalar s, cudaStream_t stream);
    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
    void setTo(GpuMat& src, Scalar s);
    void setTo(GpuMat& src, Scalar s, const GpuMat& mask);

    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
    {
        CV_Assert(src.size() == dst.size() && src.type() == dst.type());
        CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));

        cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream);
    }

    void convertTo(const GpuMat& src, GpuMat& dst)
    {
        cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0);
    }

    void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream)
    {
        cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream);
    }

    void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
    {
        typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream);

        static const caller_t callers[] =
        {
            kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>, kernelSetCaller<int>,
            kernelSetCaller<float>, kernelSetCaller<double>
        };

        callers[src.depth()](src, s, stream);
    }

    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
    {
        typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);

        static const caller_t callers[] =
        {
            kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>, kernelSetCaller<int>,
            kernelSetCaller<float>, kernelSetCaller<double>
        };

        callers[src.depth()](src, s, mask, stream);
    }

    void setTo(GpuMat& src, Scalar s)
    {
        setTo(src, s, 0);
    }

    void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
    {
        setTo(src, s, mask, 0);
    }
}}}
class CudaArch
{
public:
    CudaArch()
    {
        fromStr(CUDA_ARCH_BIN, bin);
        fromStr(CUDA_ARCH_PTX, ptx);
        fromStr(CUDA_ARCH_FEATURES, features);
    }

    bool builtWith(FeatureSet feature_set) const
    {
        return !features.empty() && (features.back() >= feature_set);
    }

    bool hasPtx(int major, int minor) const
    {
        return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
    }

    bool hasBin(int major, int minor) const
    {
        return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
    }

    bool hasEqualOrLessPtx(int major, int minor) const
    {
        return !ptx.empty() && (ptx.front() <= major * 10 + minor);
    }

    bool hasEqualOrGreaterPtx(int major, int minor) const
    {
        return !ptx.empty() && (ptx.back() >= major * 10 + minor);
    }

    bool hasEqualOrGreaterBin(int major, int minor) const
    {
        return !bin.empty() && (bin.back() >= major * 10 + minor);
    }

private:
    void fromStr(const string& set_as_str, vector<int>& arr)
    {
        if (set_as_str.find_first_not_of(" ") == string::npos)
            return;

        istringstream stream(set_as_str);
        int cur_value;
        while (!stream.eof())
        {
            stream >> cur_value;
            arr.push_back(cur_value);
        }

        sort(arr.begin(), arr.end());
    }

    vector<int> bin;
    vector<int> ptx;
    vector<int> features;
};
class DeviceProps
{
public:
    DeviceProps()
    {
        props_.resize(10, 0);
    }

    ~DeviceProps()
    {
        for (size_t i = 0; i < props_.size(); ++i)
        {
            if (props_[i])
                delete props_[i];
        }
        props_.clear();
    }

    cudaDeviceProp* get(int devID)
    {
        if (devID >= (int) props_.size())
            props_.resize(devID + 5, 0);

        if (!props_[devID])
        {
            props_[devID] = new cudaDeviceProp;
            cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) );
        }

        return props_[devID];
    }
private:
    std::vector<cudaDeviceProp*> props_;
};

DeviceProps deviceProps;
class CudaDeviceInfoFuncTable: DeviceInfoFuncTable
{
public:
    size_t sharedMemPerBlock() const
    {
        return deviceProps.get(device_id_)->sharedMemPerBlock;
    }

    void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const
    {
        int prevDeviceID = getDevice();
        if (prevDeviceID != device_id_)
            setDevice(device_id_);

        cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) );

        if (prevDeviceID != device_id_)
            setDevice(prevDeviceID);
    }

    size_t freeMemory() const
    {
        size_t _totalMemory, _freeMemory;
        queryMemory(_totalMemory, _freeMemory);
        return _freeMemory;
    }

    size_t totalMemory() const
    {
        size_t _totalMemory, _freeMemory;
        queryMemory(_totalMemory, _freeMemory);
        return _totalMemory;
    }

    bool supports(FeatureSet feature_set) const
    {
        int version = majorVersion_ * 10 + minorVersion_;
        return version >= feature_set;
    }

    bool isCompatible() const
    {
        // Check PTX compatibility
        if (hasEqualOrLessPtx(majorVersion_, minorVersion_))
            return true;

        // Check BIN compatibility
        for (int i = minorVersion_; i >= 0; --i)
            if (hasBin(majorVersion_, i))
                return true;

        return false;
    }

    void query()
    {
        const cudaDeviceProp* prop = deviceProps.get(device_id_);

        name_ = prop->name;
        multi_processor_count_ = prop->multiProcessorCount;
        majorVersion_ = prop->major;
        minorVersion_ = prop->minor;
    }

    int deviceID() const
    {
        return device_id_;
    }

    std::string name() const
    {
        return name_;
    }

    int majorVersion() const
    {
        return majorVersion_;
    }

    int minorVersion() const
    {
        return minorVersion_;
    }

    int multiProcessorCount() const
    {
        return multi_processor_count_;
    }

    int getCudaEnabledDeviceCount() const
    {
        int count;
        cudaError_t error = cudaGetDeviceCount( &count );

        if (error == cudaErrorInsufficientDriver)
            return -1;

        if (error == cudaErrorNoDevice)
            return 0;

        cudaSafeCall( error );
        return count;
    }

    void setDevice(int device) const
    {
        cudaSafeCall( cudaSetDevice( device ) );
    }

    int getDevice() const
    {
        int device;
        cudaSafeCall( cudaGetDevice( &device ) );
        return device;
    }

    void resetDevice() const
    {
        cudaSafeCall( cudaDeviceReset() );
    }

    bool builtWith(FeatureSet feature_set) const
    {
        return cudaArch.builtWith(feature_set);
    }

    bool has(int major, int minor) const
    {
        return hasPtx(major, minor) || hasBin(major, minor);
    }

    bool hasPtx(int major, int minor) const
    {
        return cudaArch.hasPtx(major, minor);
    }

    bool hasBin(int major, int minor) const
    {
        return cudaArch.hasBin(major, minor);
    }

    bool hasEqualOrLessPtx(int major, int minor) const
    {
        return cudaArch.hasEqualOrLessPtx(major, minor);
    }

    bool hasEqualOrGreater(int major, int minor) const
    {
        return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
    }

    bool hasEqualOrGreaterPtx(int major, int minor) const
    {
        return cudaArch.hasEqualOrGreaterPtx(major, minor);
    }

    bool hasEqualOrGreaterBin(int major, int minor) const
    {
        return cudaArch.hasEqualOrGreaterBin(major, minor);
    }

    bool deviceSupports(FeatureSet feature_set) const
    {
        static int versions[] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
        };
        static const int cache_size = static_cast<int>(sizeof(versions) / sizeof(versions[0]));

        const int devId = getDevice();

        int version;

        if (devId < cache_size && versions[devId] >= 0)
            version = versions[devId];
        else
        {
            DeviceInfo dev(devId);
            version = dev.majorVersion() * 10 + dev.minorVersion();
            if (devId < cache_size)
                versions[devId] = version;
        }

        return TargetArchs::builtWith(feature_set) && (version >= feature_set);
    }

    void printCudaDeviceInfo(int device) const
    {
        int count = getCudaEnabledDeviceCount();
        bool valid = (device >= 0) && (device < count);

        int beg = valid ? device : 0;
        int end = valid ? device+1 : count;

        printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n");
        printf("Device count: %d\n", count);

        int driverVersion = 0, runtimeVersion = 0;
        cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
        cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );

        const char *computeMode[] = {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };

        for(int dev = beg; dev < end; ++dev)
        {
            cudaDeviceProp prop;
            cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );

            printf("\nDevice %d: \"%s\"\n", dev, prop.name);
            printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
            printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor);
            printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem);

            int cores = convertSMVer2Cores(prop.major, prop.minor);
            if (cores > 0)
                printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount);

            printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f);

            printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
                prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1],
                prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]);
            printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n",
                prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1],
                prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]);

            printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem);
            printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock);
            printf(" Total number of registers available per block: %d\n", prop.regsPerBlock);
            printf(" Warp size: %d\n", prop.warpSize);
            printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock);
            printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
            printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
            printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch);
            printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment);
            printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount);
            printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No");
            printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No");
            printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No");
            printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No");
            printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No");
            printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? "Yes" : "No");
            printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No");
            printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No");
            printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID );
            printf(" Compute Mode:\n");
            printf(" %s \n", computeMode[prop.computeMode]);
        }

        printf("\n");
        printf("deviceQuery, CUDA Driver = CUDART");
        printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100);
        printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100);
        printf(", NumDevs = %d\n\n", count);
        fflush(stdout);
    }

    void printShortCudaDeviceInfo(int device) const
    {
        int count = getCudaEnabledDeviceCount();
        bool valid = (device >= 0) && (device < count);

        int beg = valid ? device : 0;
        int end = valid ? device+1 : count;

        int driverVersion = 0, runtimeVersion = 0;
        cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
        cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );

        for(int dev = beg; dev < end; ++dev)
        {
            cudaDeviceProp prop;
            cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );

            const char *arch_str = prop.major < 2 ? " (not Fermi)" : "";
            printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f);
            printf(", sm_%d%d%s", prop.major, prop.minor, arch_str);

            int cores = convertSMVer2Cores(prop.major, prop.minor);
            if (cores > 0)
                printf(", %d cores", cores * prop.multiProcessorCount);

            printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
        }
        fflush(stdout);
    }
private:
    int device_id_;

    std::string name_;
    int multi_processor_count_;
    int majorVersion_;
    int minorVersion_;

    const CudaArch cudaArch;

    int convertSMVer2Cores(int major, int minor) const
    {
        // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
        typedef struct {
            int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
            int Cores;
        } SMtoCores;

        SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } };

        int index = 0;
        while (gpuArchCoresPerSM[index].SM != -1)
        {
            if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) )
                return gpuArchCoresPerSM[index].Cores;
            index++;
        }
        return -1;
    }
};
class CudaFuncTable : public GpuFuncTable
{
public:

    void copy(const Mat& src, GpuMat& dst) const
    {
        cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) );
    }

    void copy(const GpuMat& src, Mat& dst) const
    {
        cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) );
    }

    void copy(const GpuMat& src, GpuMat& dst) const
    {
        cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) );
    }

    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
    {
        CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
        CV_Assert(src.size() == dst.size() && src.type() == dst.type());
        CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));

        if (src.depth() == CV_64F)
        {
            if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
                CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
        }

        typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
        static const func_t funcs[7][4] =
        {
            /* 8U */ {NppCopyMasked<CV_8U , nppiCopy_8u_C1MR >::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_8U , nppiCopy_8u_C3MR >::call, NppCopyMasked<CV_8U , nppiCopy_8u_C4MR >::call},
            /* 8S */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask },
            /* 16U */ {NppCopyMasked<CV_16U, nppiCopy_16u_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_16U, nppiCopy_16u_C3MR>::call, NppCopyMasked<CV_16U, nppiCopy_16u_C4MR>::call},
            /* 16S */ {NppCopyMasked<CV_16S, nppiCopy_16s_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_16S, nppiCopy_16s_C3MR>::call, NppCopyMasked<CV_16S, nppiCopy_16s_C4MR>::call},
            /* 32S */ {NppCopyMasked<CV_32S, nppiCopy_32s_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_32S, nppiCopy_32s_C3MR>::call, NppCopyMasked<CV_32S, nppiCopy_32s_C4MR>::call},
            /* 32F */ {NppCopyMasked<CV_32F, nppiCopy_32f_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_32F, nppiCopy_32f_C3MR>::call, NppCopyMasked<CV_32F, nppiCopy_32f_C4MR>::call},
            /* 64F */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask }
        };

        const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask;

        func(src, dst, mask, 0);
    }

    void convert(const GpuMat& src, GpuMat& dst) const
    {
        typedef void (*func_t)(const GpuMat& src, GpuMat& dst);
        static const func_t funcs[7][7][4] =
        {
            {
                /* 8U -> 8U */ {0, 0, 0, 0},
                /* 8U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
                /* 8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::call},
                /* 8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::call},
                /* 8U -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
                /* 8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
                /* 8U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }
            },
            {
                /* 8S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                /* 8S -> 8S */ {0,0,0,0},
                /* 8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                /* 8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                /* 8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                /* 8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                /* 8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
            },
            {
                /* 16U -> 8U */ {NppCvt<CV_16U, CV_8U , nppiConvert_16u8u_C1R >::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::call},
                /* 16U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
                /* 16U -> 16U */ {0,0,0,0},
                /* 16U -> 16S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
                /* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
                /* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
                /* 16U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }
            },
            {
                /* 16S -> 8U */ {NppCvt<CV_16S, CV_8U , nppiConvert_16s8u_C1R >::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::call},
                /* 16S -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
                /* 16S -> 16U */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
                /* 16S -> 16S */ {0,0,0,0},
                /* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
                /* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
                /* 16S -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }
            },
            {
                /* 32S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                /* 32S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                /* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32S -> 32S */ {0,0,0,0},
/* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
},
{
/* 32F -> 8U */ {NppCvt<CV_32F, CV_8U , nppiConvert_32f8u_C1R >::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32F -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32F -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32F -> 32F */ {0,0,0,0},
/* 32F -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
},
{
/* 64F -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 64F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 64F -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 64F -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 64F -> 64F */ {0,0,0,0}
} }
};
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
static const func_t funcs[7][4] = CV_Assert(dst.depth() <= CV_64F);
{ CV_Assert(src.size() == dst.size() && src.channels() == dst.channels());
/* 8U */ {NppCopyMasked<CV_8U , nppiCopy_8u_C1MR >::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_8U , nppiCopy_8u_C3MR >::call, NppCopyMasked<CV_8U , nppiCopy_8u_C4MR >::call},
/* 8S */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask },
/* 16U */ {NppCopyMasked<CV_16U, nppiCopy_16u_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_16U, nppiCopy_16u_C3MR>::call, NppCopyMasked<CV_16U, nppiCopy_16u_C4MR>::call},
/* 16S */ {NppCopyMasked<CV_16S, nppiCopy_16s_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_16S, nppiCopy_16s_C3MR>::call, NppCopyMasked<CV_16S, nppiCopy_16s_C4MR>::call},
/* 32S */ {NppCopyMasked<CV_32S, nppiCopy_32s_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_32S, nppiCopy_32s_C3MR>::call, NppCopyMasked<CV_32S, nppiCopy_32s_C4MR>::call},
/* 32F */ {NppCopyMasked<CV_32F, nppiCopy_32f_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_32F, nppiCopy_32f_C3MR>::call, NppCopyMasked<CV_32F, nppiCopy_32f_C4MR>::call},
/* 64F */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask }
};
const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask; if (src.depth() == CV_64F || dst.depth() == CV_64F)
{
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
func(src, dst, mask, 0); bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
if (!aligned)
{
cv::gpu::device::convertTo(src, dst);
return;
} }
void convert(const GpuMat& src, GpuMat& dst) const const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1];
CV_DbgAssert(func != 0);
func(src, dst);
}
void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const
{
CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
CV_Assert(dst.depth() <= CV_64F);
if (src.depth() == CV_64F || dst.depth() == CV_64F)
{ {
typedef void (*func_t)(const GpuMat& src, GpuMat& dst); if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
static const func_t funcs[7][7][4] = CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
{ }
{
/* 8U -> 8U */ {0, 0, 0, 0},
/* 8U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
/* 8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::call},
/* 8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::call},
/* 8U -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
/* 8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
/* 8U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }
},
{
/* 8S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 8S -> 8S */ {0,0,0,0},
/* 8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
},
{
/* 16U -> 8U */ {NppCvt<CV_16U, CV_8U , nppiConvert_16u8u_C1R >::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::call},
/* 16U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
/* 16U -> 16U */ {0,0,0,0},
/* 16U -> 16S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
/* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
/* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
/* 16U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }
},
{
/* 16S -> 8U */ {NppCvt<CV_16S, CV_8U , nppiConvert_16s8u_C1R >::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::call},
/* 16S -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
/* 16S -> 16U */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
/* 16S -> 16S */ {0,0,0,0},
/* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
/* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo },
/* 16S -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }
},
{
/* 32S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32S -> 32S */ {0,0,0,0},
/* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
},
{
/* 32F -> 8U */ {NppCvt<CV_32F, CV_8U , nppiConvert_32f8u_C1R >::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32F -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32F -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 32F -> 32F */ {0,0,0,0},
/* 32F -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
},
{
/* 64F -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 64F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 64F -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 64F -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
/* 64F -> 64F */ {0,0,0,0}
}
};
CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); cv::gpu::device::convertTo(src, dst, alpha, beta, stream);
CV_Assert(dst.depth() <= CV_64F); }
CV_Assert(src.size() == dst.size() && src.channels() == dst.channels());
if (src.depth() == CV_64F || dst.depth() == CV_64F) void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const
{
if (mask.empty())
{
if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
{ {
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); return;
} }
bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); if (m.depth() == CV_8U)
if (!aligned)
{ {
cv::gpu::device::convertTo(src, dst); int cn = m.channels();
return;
if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
{
int val = saturate_cast<uchar>(s[0]);
cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
return;
}
} }
const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; typedef void (*func_t)(GpuMat& src, Scalar s);
CV_DbgAssert(func != 0); static const func_t funcs[7][4] =
{
{NppSet<CV_8U , 1, nppiSet_8u_C1R >::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet<CV_8U , 4, nppiSet_8u_C4R >::call},
{cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo },
{NppSet<CV_16U, 1, nppiSet_16u_C1R>::call, NppSet<CV_16U, 2, nppiSet_16u_C2R>::call, cv::gpu::device::setTo , NppSet<CV_16U, 4, nppiSet_16u_C4R>::call},
{NppSet<CV_16S, 1, nppiSet_16s_C1R>::call, NppSet<CV_16S, 2, nppiSet_16s_C2R>::call, cv::gpu::device::setTo , NppSet<CV_16S, 4, nppiSet_16s_C4R>::call},
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet<CV_32S, 4, nppiSet_32s_C4R>::call},
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet<CV_32F, 4, nppiSet_32f_C4R>::call},
{cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo }
};
func(src, dst); CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
}
void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const if (m.depth() == CV_64F)
{
CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
CV_Assert(dst.depth() <= CV_64F);
if (src.depth() == CV_64F || dst.depth() == CV_64F)
{ {
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
} }
cv::gpu::device::convertTo(src, dst, alpha, beta, stream); if (stream)
cv::gpu::device::setTo(m, s, stream);
else
funcs[m.depth()][m.channels() - 1](m, s);
} }
else
void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const
{ {
if (mask.empty()) typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask);
static const func_t funcs[7][4] =
{ {
if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) {NppSetMask<CV_8U , 1, nppiSet_8u_C1MR >::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_8U , 4, nppiSet_8u_C4MR >::call},
{ {cv::gpu::device::setTo , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo },
cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::call},
return; {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::call},
} {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::call},
{NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::call},
{cv::gpu::device::setTo , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo }
};
if (m.depth() == CV_8U) CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
{
int cn = m.channels();
if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
{
int val = saturate_cast<uchar>(s[0]);
cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
return;
}
}
typedef void (*func_t)(GpuMat& src, Scalar s); if (m.depth() == CV_64F)
static const func_t funcs[7][4] =
{
{NppSet<CV_8U , 1, nppiSet_8u_C1R >::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet<CV_8U , 4, nppiSet_8u_C4R >::call},
{cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo },
{NppSet<CV_16U, 1, nppiSet_16u_C1R>::call, NppSet<CV_16U, 2, nppiSet_16u_C2R>::call, cv::gpu::device::setTo , NppSet<CV_16U, 4, nppiSet_16u_C4R>::call},
{NppSet<CV_16S, 1, nppiSet_16s_C1R>::call, NppSet<CV_16S, 2, nppiSet_16s_C2R>::call, cv::gpu::device::setTo , NppSet<CV_16S, 4, nppiSet_16s_C4R>::call},
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet<CV_32S, 4, nppiSet_32s_C4R>::call},
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet<CV_32F, 4, nppiSet_32f_C4R>::call},
{cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo }
};
CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
if (m.depth() == CV_64F)
{
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
if (stream)
cv::gpu::device::setTo(m, s, stream);
else
funcs[m.depth()][m.channels() - 1](m, s);
}
else
{ {
typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask); if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
static const func_t funcs[7][4] = CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
{
{NppSetMask<CV_8U , 1, nppiSet_8u_C1MR >::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_8U , 4, nppiSet_8u_C4MR >::call},
{cv::gpu::device::setTo , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo },
{NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::call},
{NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::call},
{NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::call},
{NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::call},
{cv::gpu::device::setTo , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo }
};
CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
if (m.depth() == CV_64F)
{
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
if (stream)
cv::gpu::device::setTo(m, s, mask, stream);
else
funcs[m.depth()][m.channels() - 1](m, s, mask);
} }
}
void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const if (stream)
{ cv::gpu::device::setTo(m, s, mask, stream);
cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) ); else
funcs[m.depth()][m.channels() - 1](m, s, mask);
} }
}
void free(void* devPtr) const void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
{ {
cudaFree(devPtr); cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
} }
};
void free(void* devPtr) const
{
cudaFree(devPtr);
}
};
#endif #endif
#endif #endif
\ No newline at end of file
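The CudaFuncTable methods above all route through the same pattern: a static table of function pointers indexed by [depth][channels - 1], where NPP-backed specializations fill the cells that NPP covers and a generic device kernel handles the rest. The following standalone sketch (not part of this commit) illustrates that dispatch shape with hypothetical stand-in types and functions; only the lookup-with-fallback idea is taken from the code above.

#include <cstdio>

// Hypothetical stand-ins for OpenCV's depth codes and GpuMat; for illustration only.
enum Depth { D_8U = 0, D_32F = 1, DEPTH_COUNT = 2 };

struct Image { Depth depth; int channels; };

typedef void (*set_func_t)(Image& img, double value);

// A "specialized" path (standing in for an NPP-backed call) and a generic
// fallback (standing in for a hand-written device kernel).
static void setSpecialized(Image&, double v) { std::printf("specialized setTo(%f)\n", v); }
static void setGeneric(Image&, double v)     { std::printf("generic setTo(%f)\n", v); }

// Dispatch table indexed by [depth][channels - 1]; a 0 entry means
// "no specialized path for this combination".
static const set_func_t funcs[DEPTH_COUNT][4] =
{
    /* 8U  */ { setSpecialized, setGeneric, setGeneric, setSpecialized },
    /* 32F */ { setSpecialized, 0,          setGeneric, setGeneric     }
};

void setTo(Image& img, double value)
{
    const set_func_t func = funcs[img.depth][img.channels - 1];
    (func ? func : setGeneric)(img, value);   // fall back when no specialization exists
}

int main()
{
    Image a = { D_8U, 4 };
    Image b = { D_32F, 2 };
    setTo(a, 127.0);  // takes the specialized path
    setTo(b, 0.5);    // 0 entry in the table, so the generic path runs
    return 0;
}

Keeping the table static const means the dispatch costs one indexed load per call, and adding support for a new depth or channel count only touches one row of the table rather than a chain of if/else checks.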
...@@ -39,6 +39,9 @@ static EmptyFuncTable gpuTable;
extern "C" {

DeviceInfoFuncTable* deviceInfoFactory();
GpuFuncTable* gpuFactory();

DeviceInfoFuncTable* deviceInfoFactory()
{
    return (DeviceInfoFuncTable*)&deviceInfoTable;
...
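With DYNAMIC_CUDA_SUPPORT defined, core is expected to resolve these extern "C" factories from the separately built dynamicuda module at runtime rather than linking the CUDA-dependent code statically. The sketch below shows one way such a module could be loaded with dlopen/dlsym; the library name libopencv_dynamicuda.so, the error handling, and the surrounding program are assumptions for illustration, not the loader this commit ships.

#include <dlfcn.h>
#include <cstdio>

// Assumed opaque types matching the extern "C" factory declarations above.
struct DeviceInfoFuncTable;
struct GpuFuncTable;

typedef DeviceInfoFuncTable* (*deviceInfoFactory_t)();
typedef GpuFuncTable* (*gpuFactory_t)();

int main()
{
    // Library name is an assumption; a real loader would also search install paths.
    void* handle = dlopen("libopencv_dynamicuda.so", RTLD_LAZY);
    if (!handle)
    {
        std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
        return 1;
    }

    // Resolve the C-linkage factory symbols exported by the module.
    deviceInfoFactory_t deviceInfoFactory =
        (deviceInfoFactory_t)dlsym(handle, "deviceInfoFactory");
    gpuFactory_t gpuFactory = (gpuFactory_t)dlsym(handle, "gpuFactory");

    if (!deviceInfoFactory || !gpuFactory)
    {
        std::fprintf(stderr, "dlsym failed: %s\n", dlerror());
        dlclose(handle);
        return 1;
    }

    // The returned tables provide the CUDA-backed implementations; without the
    // module, a caller would fall back to an empty (CPU-only) table instead.
    DeviceInfoFuncTable* deviceInfoTable = deviceInfoFactory();
    GpuFuncTable* gpuTable = gpuFactory();
    (void)deviceInfoTable;
    (void)gpuTable;

    dlclose(handle);
    return 0;
}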
...@@ -297,7 +297,7 @@ if(BUILD_FAT_JAVA_LIB)
list(REMOVE_ITEM __deps ${m})
endif()
endforeach()
if (HAVE_opencv_dynamicuda) if (ENABLE_DYNAMIC_CUDA)
list(REMOVE_ITEM __deps "opencv_dynamicuda")
endif()
if (ANDROID AND HAVE_opencv_gpu)
...