submodule / opencv / Commits / d4087f19

Commit d4087f19 authored Dec 11, 2013 by Alexander Smorkalov
All CUDA-related code was moved into a separate dynamic library.
parent e88253cc

Showing 6 changed files with 1203 additions and 1071 deletions (+1203 / -1071)
modules/core/CMakeLists.txt                     +16    -7
modules/core/cuda/CMakeLists.txt                +12    -0
modules/core/cuda/main.cpp                      +24    -0
modules/core/include/opencv2/core/gpumat.hpp     +2    -0
modules/core/src/gpumat.cpp                     +79    -1064
modules/core/src/gpumat_cuda.hpp              +1070    -0
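The core of the change is visible in gpumat.cpp and gpumat_cuda.hpp below: every CUDA-touching entry point in the core module is routed through an abstract GpuFuncTable, and a stub EmptyFuncTable is substituted when CUDA is absent or split into the separate library. The following stand-alone sketch is not OpenCV code; it only illustrates that dispatch pattern with shortened, hypothetical names and two slots instead of the real interface's several dozen.

    // Minimal sketch of the function-table indirection used by this commit.
    #include <cstdio>
    #include <stdexcept>

    struct GpuBackend                    // plays the role of GpuFuncTable
    {
        virtual ~GpuBackend() {}
        virtual int  deviceCount() const = 0;
        virtual void setDevice(int) const = 0;
    };

    struct EmptyBackend : GpuBackend     // plays the role of EmptyFuncTable
    {
        int  deviceCount() const { return 0; }   // no CUDA: report zero devices
        void setDevice(int) const { throw std::runtime_error("built without CUDA support"); }
    };

    static const GpuBackend* backend()   // plays the role of gpuFuncTable()
    {
        static EmptyBackend empty;       // a CUDA-enabled table would be returned here instead
        return &empty;
    }

    int main()
    {
        std::printf("devices: %d\n", backend()->deviceCount());  // prints 0 with the empty table
        return 0;
    }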
modules/core/CMakeLists.txt

 set(the_description "The Core Functionality")
-ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
-ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
 
 if(HAVE_WINRT)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"")
 endif()
 
-if(HAVE_CUDA)
-  ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
-endif()
-
 file(GLOB lib_cuda_hdrs        "include/opencv2/${name}/cuda/*.hpp"        "include/opencv2/${name}/cuda/*.h")
 file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
 
 source_group("Cuda Headers"         FILES ${lib_cuda_hdrs})
 source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
 
+if(DYNAMIC_CUDA_SUPPORT)
+  add_definitions(-DDYNAMIC_CUDA_SUPPORT)
+endif()
+
+ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
+ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
+
+if(HAVE_CUDA)
+  ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
+endif()
+
 ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
                         HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
...
@@ -25,3 +30,7 @@ ocv_add_precompiled_headers(${the_module})

 ocv_add_accuracy_tests()
 ocv_add_perf_tests()
+
+if(DYNAMIC_CUDA_SUPPORT)
+  add_subdirectory(cuda)
+endif()
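Taken together, the preprocessor switches introduced here imply three build configurations: no CUDA at all, CUDA compiled statically into core, and CUDA compiled as the separate opencv_core_cuda library when DYNAMIC_CUDA_SUPPORT is defined. The snippet below is only an illustration of how those two macros combine, as far as this commit shows; the printed strings are a summary, not OpenCV output.

    // Illustrative only: the two switches seen in this commit's build files.
    #include <cstdio>

    int main()
    {
    #if !defined(HAVE_CUDA)
        std::printf("CPU-only build: core uses the EmptyFuncTable stubs\n");
    #elif defined(DYNAMIC_CUDA_SUPPORT)
        std::printf("Dynamic build: core keeps the stubs, CUDA code goes into the opencv_core_cuda shared library\n");
    #else
        std::printf("Classic build: CUDA code is compiled directly into core\n");
    #endif
        return 0;
    }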
modules/core/cuda/CMakeLists.txt (new file)

project(opencv_core_cuda)

set(HAVE_CUDA FALSE)
add_definitions("-DHAVE_CUDA")
include_directories(${CUDA_INCLUDE_DIRS} "../src/" "../include/opencv2/core/" "${OpenCV_SOURCE_DIR}/modules/gpu/include")
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
cuda_add_library(opencv_core_cuda SHARED main.cpp ../src/cuda/matrix_operations.cu)
target_link_libraries(opencv_core_cuda ${CUDA_LIBRARIES})
(no newline at end of file)
modules/core/cuda/main.cpp (new file)

#include "opencv2/core/core.hpp"
#include "opencv2/core/gpumat.hpp"

#ifdef HAVE_CUDA
#include <cuda_runtime.h>
#include <npp.h>

#define CUDART_MINIMUM_REQUIRED_VERSION 4020
#define NPP_MINIMUM_REQUIRED_VERSION 4200

#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
    #error "Insufficient Cuda Runtime library version, please update it."
#endif

#if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION)
    #error "Insufficient NPP version, please update it."
#endif
#endif

using namespace cv;
using namespace cv::gpu;

#include "gpumat_cuda.hpp"
(no newline at end of file)
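A worked example of the two version thresholds in main.cpp, under the assumption that CUDART_VERSION is encoded as 1000*major + 10*minor (the convention used by the CUDA runtime headers); the NPP composite is taken directly from the #if check above. So 4020 corresponds to CUDA Runtime 4.2 and 4200 to NPP 4.2.0.

    // Decoding the minimum-version macros (illustrative, not OpenCV code).
    #include <cassert>

    int main()
    {
        const int cudart_min = 4020;                       // CUDART_MINIMUM_REQUIRED_VERSION
        assert(cudart_min / 1000 == 4 && (cudart_min % 1000) / 10 == 2);   // CUDA Runtime 4.2

        const int npp_major = 4, npp_minor = 2, npp_build = 0;
        assert(npp_major * 1000 + npp_minor * 100 + npp_build == 4200);    // NPP_MINIMUM_REQUIRED_VERSION
        return 0;
    }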
modules/core/include/opencv2/core/gpumat.hpp

@@ -48,6 +48,8 @@
 #include "opencv2/core/core.hpp"
 #include "opencv2/core/cuda_devptrs.hpp"
 
+#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
+
 namespace cv { namespace gpu
 {
     //////////////////////////////// Initialization & Info ////////////////////////
...
modules/core/src/gpumat.cpp
...
@@ -44,7 +44,7 @@
 #include "opencv2/core/gpumat.hpp"
 #include <iostream>
 
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA)
 #include <cuda_runtime.h>
 #include <npp.h>
...
@@ -64,489 +64,62 @@ using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 
-#ifndef HAVE_CUDA
+#include "gpumat_cuda.hpp"
 
-#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
-
-#else // HAVE_CUDA
-
 namespace
 {
-    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func)
-    #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func)
-
-    inline void ___cudaSafeCall(cudaError_t err, const char* file, const int line, const char* func = "")
-    {
-        if (cudaSuccess != err)
-            cv::gpu::error(cudaGetErrorString(err), file, line, func);
-    }
-
-    inline void ___nppSafeCall(int err, const char* file, const int line, const char* func = "")
-    {
-        if (err < 0)
-        {
-            std::ostringstream msg;
-            msg << "NPP API Call Error: " << err;
-            cv::gpu::error(msg.str().c_str(), file, line, func);
-        }
-    }
+    const GpuFuncTable* gpuFuncTable()
+    {
+        static EmptyFuncTable funcTable;
+        return &funcTable;
+    }
 }
-#endif // HAVE_CUDA
 
 //////////////////////////////// Initialization & Info ////////////////////////
+int cv::gpu::getCudaEnabledDeviceCount() { return gpuFuncTable()->getCudaEnabledDeviceCount(); }
-#ifndef HAVE_CUDA
-
-int cv::gpu::getCudaEnabledDeviceCount() { return 0; }
-
-void cv::gpu::setDevice(int) { throw_nogpu; }
-int cv::gpu::getDevice() { throw_nogpu; return 0; }
-
-void cv::gpu::resetDevice() { throw_nogpu; }
-
-bool cv::gpu::deviceSupports(FeatureSet) { throw_nogpu; return false; }
-
-bool cv::gpu::TargetArchs::builtWith(FeatureSet) { throw_nogpu; return false; }
-bool cv::gpu::TargetArchs::has(int, int) { throw_nogpu; return false; }
-bool cv::gpu::TargetArchs::hasPtx(int, int) { throw_nogpu; return false; }
-bool cv::gpu::TargetArchs::hasBin(int, int) { throw_nogpu; return false; }
-bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int, int) { throw_nogpu; return false; }
-bool cv::gpu::TargetArchs::hasEqualOrGreater(int, int) { throw_nogpu; return false; }
-bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int, int) { throw_nogpu; return false; }
-bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int, int) { throw_nogpu; return false; }
-
-size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { throw_nogpu; return 0; }
-void cv::gpu::DeviceInfo::queryMemory(size_t&, size_t&) const { throw_nogpu; }
-size_t cv::gpu::DeviceInfo::freeMemory() const { throw_nogpu; return 0; }
-size_t cv::gpu::DeviceInfo::totalMemory() const { throw_nogpu; return 0; }
-bool cv::gpu::DeviceInfo::supports(FeatureSet) const { throw_nogpu; return false; }
-bool cv::gpu::DeviceInfo::isCompatible() const { throw_nogpu; return false; }
-void cv::gpu::DeviceInfo::query() { throw_nogpu; }
-
-void cv::gpu::printCudaDeviceInfo(int) { throw_nogpu; }
-void cv::gpu::printShortCudaDeviceInfo(int) { throw_nogpu; }
-#else // HAVE_CUDA
-
-int cv::gpu::getCudaEnabledDeviceCount()
-{
-    int count;
-    cudaError_t error = cudaGetDeviceCount( &count );
-
-    if (error == cudaErrorInsufficientDriver)
-        return -1;
-
-    if (error == cudaErrorNoDevice)
-        return 0;
-
-    cudaSafeCall(error);
-    return count;
-}
-
-void cv::gpu::setDevice(int device) { cudaSafeCall( cudaSetDevice(device) ); }
-int cv::gpu::getDevice() { int device; cudaSafeCall( cudaGetDevice(&device) ); return device; }
-void cv::gpu::resetDevice() { cudaSafeCall( cudaDeviceReset() ); }
-
-namespace
-{
-    class CudaArch
-    {
-    public:
-        CudaArch();
-
-        bool builtWith(FeatureSet feature_set) const;
-        bool hasPtx(int major, int minor) const;
-        bool hasBin(int major, int minor) const;
-        bool hasEqualOrLessPtx(int major, int minor) const;
-        bool hasEqualOrGreaterPtx(int major, int minor) const;
-        bool hasEqualOrGreaterBin(int major, int minor) const;
-
-    private:
-        static void fromStr(const string& set_as_str, vector<int>& arr);
-
-        vector<int> bin;
-        vector<int> ptx;
-        vector<int> features;
-    };
-
-    const CudaArch cudaArch;
-
-    CudaArch::CudaArch()
-    {
-        fromStr(CUDA_ARCH_BIN, bin);
-        fromStr(CUDA_ARCH_PTX, ptx);
-        fromStr(CUDA_ARCH_FEATURES, features);
-    }
-
-    bool CudaArch::builtWith(FeatureSet feature_set) const { return !features.empty() && (features.back() >= feature_set); }
-    bool CudaArch::hasPtx(int major, int minor) const { return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); }
-    bool CudaArch::hasBin(int major, int minor) const { return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); }
-    bool CudaArch::hasEqualOrLessPtx(int major, int minor) const { return !ptx.empty() && (ptx.front() <= major * 10 + minor); }
-    bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const { return !ptx.empty() && (ptx.back() >= major * 10 + minor); }
-    bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const { return !bin.empty() && (bin.back() >= major * 10 + minor); }
-
-    void CudaArch::fromStr(const string& set_as_str, vector<int>& arr)
-    {
-        if (set_as_str.find_first_not_of(" ") == string::npos)
-            return;
-
-        istringstream stream(set_as_str);
-        int cur_value;
-        while (!stream.eof())
-        {
-            stream >> cur_value;
-            arr.push_back(cur_value);
-        }
-        sort(arr.begin(), arr.end());
-    }
-}
-
-bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set) { return cudaArch.builtWith(feature_set); }
-bool cv::gpu::TargetArchs::has(int major, int minor) { return hasPtx(major, minor) || hasBin(major, minor); }
-bool cv::gpu::TargetArchs::hasPtx(int major, int minor) { return cudaArch.hasPtx(major, minor); }
-bool cv::gpu::TargetArchs::hasBin(int major, int minor) { return cudaArch.hasBin(major, minor); }
-bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { return cudaArch.hasEqualOrLessPtx(major, minor); }
-bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); }
-bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return cudaArch.hasEqualOrGreaterPtx(major, minor); }
-bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return cudaArch.hasEqualOrGreaterBin(major, minor); }
-
-bool cv::gpu::deviceSupports(FeatureSet feature_set)
-{
-    static int versions[] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
-    static const int cache_size = static_cast<int>(sizeof(versions) / sizeof(versions[0]));
-
-    const int devId = getDevice();
-    int version;
-
-    if (devId < cache_size && versions[devId] >= 0)
-        version = versions[devId];
-    else
-    {
-        DeviceInfo dev(devId);
-        version = dev.majorVersion() * 10 + dev.minorVersion();
-        if (devId < cache_size)
-            versions[devId] = version;
-    }
-
-    return TargetArchs::builtWith(feature_set) && (version >= feature_set);
-}
+void cv::gpu::setDevice(int device) { gpuFuncTable()->setDevice(device); }
+int cv::gpu::getDevice() { return gpuFuncTable()->getDevice(); }
+
+void cv::gpu::resetDevice() { gpuFuncTable()->resetDevice(); }
+
+bool cv::gpu::deviceSupports(FeatureSet feature_set) { return gpuFuncTable()->deviceSupports(feature_set); }
+
+bool cv::gpu::TargetArchs::builtWith(FeatureSet feature_set) { return gpuFuncTable()->builtWith(feature_set); }
+bool cv::gpu::TargetArchs::has(int major, int minor) { return gpuFuncTable()->has(major, minor); }
+bool cv::gpu::TargetArchs::hasPtx(int major, int minor) { return gpuFuncTable()->hasPtx(major, minor); }
+bool cv::gpu::TargetArchs::hasBin(int major, int minor) { return gpuFuncTable()->hasBin(major, minor); }
+bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrLessPtx(major, minor); }
+bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return gpuFuncTable()->hasEqualOrGreater(major, minor); }
+bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterPtx(major, minor); }
+bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterBin(major, minor); }
+
+size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return gpuFuncTable()->sharedMemPerBlock(); }
+void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { gpuFuncTable()->queryMemory(total_memory, free_memory); }
+size_t cv::gpu::DeviceInfo::freeMemory() const { return gpuFuncTable()->freeMemory(); }
+size_t cv::gpu::DeviceInfo::totalMemory() const { return gpuFuncTable()->totalMemory(); }
+bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return gpuFuncTable()->supports(feature_set); }
+bool cv::gpu::DeviceInfo::isCompatible() const { return gpuFuncTable()->isCompatible(); }
+void cv::gpu::DeviceInfo::query() { gpuFuncTable()->query(); }
+
+void cv::gpu::printCudaDeviceInfo(int device) { gpuFuncTable()->printCudaDeviceInfo(device); }
+void cv::gpu::printShortCudaDeviceInfo(int device) { gpuFuncTable()->printShortCudaDeviceInfo(device); }
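// Note on the arithmetic in the removed deviceSupports() and CudaArch helpers above:
// a compute capability "major.minor" is encoded as the integer major*10 + minor, so
// capabilities and FeatureSet values can be compared with a plain integer >=.
// The FeatureSet numeric values below are an assumption for illustration, not taken
// from this diff; this snippet is a stand-alone sketch, not OpenCV code.
#include <cstdio>

static int encode_cc(int major, int minor) { return major * 10 + minor; }

int main()
{
    const int FEATURE_SET_COMPUTE_13 = 13;   // assumed numeric value
    const int FEATURE_SET_COMPUTE_20 = 20;   // assumed numeric value

    int device_version = encode_cc(3, 5);    // a compute-capability 3.5 device encodes to 35
    std::printf("supports CC1.3 features: %s\n", device_version >= FEATURE_SET_COMPUTE_13 ? "yes" : "no");
    std::printf("supports CC2.0 features: %s\n", device_version >= FEATURE_SET_COMPUTE_20 ? "yes" : "no");
    return 0;
}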
+#ifdef HAVE_CUDA
+
+namespace cv { namespace gpu
+{
+    CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, cudaStream_t);
+    CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&);
+    CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double, cudaStream_t = 0);
+    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, cudaStream_t);
+    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t);
+    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar);
+    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&);
+}}
+
+#endif
-namespace
-{
-    class DeviceProps
-    {
-    public:
-        DeviceProps();
-        ~DeviceProps();
-
-        cudaDeviceProp* get(int devID);
-
-    private:
-        std::vector<cudaDeviceProp*> props_;
-    };
-
-    DeviceProps::DeviceProps()
-    {
-        props_.resize(10, 0);
-    }
-
-    DeviceProps::~DeviceProps()
-    {
-        for (size_t i = 0; i < props_.size(); ++i)
-        {
-            if (props_[i])
-                delete props_[i];
-        }
-        props_.clear();
-    }
-
-    cudaDeviceProp* DeviceProps::get(int devID)
-    {
-        if (devID >= (int) props_.size())
-            props_.resize(devID + 5, 0);
-
-        if (!props_[devID])
-        {
-            props_[devID] = new cudaDeviceProp;
-            cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) );
-        }
-
-        return props_[devID];
-    }
-
-    DeviceProps deviceProps;
-}
-
-size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return deviceProps.get(device_id_)->sharedMemPerBlock; }
-
-void cv::gpu::DeviceInfo::queryMemory(size_t& _totalMemory, size_t& _freeMemory) const
-{
-    int prevDeviceID = getDevice();
-    if (prevDeviceID != device_id_)
-        setDevice(device_id_);
-
-    cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) );
-
-    if (prevDeviceID != device_id_)
-        setDevice(prevDeviceID);
-}
-
-size_t cv::gpu::DeviceInfo::freeMemory() const
-{
-    size_t _totalMemory, _freeMemory;
-    queryMemory(_totalMemory, _freeMemory);
-    return _freeMemory;
-}
-
-size_t cv::gpu::DeviceInfo::totalMemory() const
-{
-    size_t _totalMemory, _freeMemory;
-    queryMemory(_totalMemory, _freeMemory);
-    return _totalMemory;
-}
-
-bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const
-{
-    int version = majorVersion() * 10 + minorVersion();
-    return version >= feature_set;
-}
-
-bool cv::gpu::DeviceInfo::isCompatible() const
-{
-    // Check PTX compatibility
-    if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion()))
-        return true;
-
-    // Check BIN compatibility
-    for (int i = minorVersion(); i >= 0; --i)
-        if (TargetArchs::hasBin(majorVersion(), i))
-            return true;
-
-    return false;
-}
-
-void cv::gpu::DeviceInfo::query()
-{
-    const cudaDeviceProp* prop = deviceProps.get(device_id_);
-
-    name_ = prop->name;
-    multi_processor_count_ = prop->multiProcessorCount;
-    majorVersion_ = prop->major;
-    minorVersion_ = prop->minor;
-}
-
-namespace
-{
-    int convertSMVer2Cores(int major, int minor)
-    {
-        // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
-        typedef struct {
-            int SM;    // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
-            int Cores;
-        } SMtoCores;
-
-        SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, { 0x30, 192 }, { 0x35, 192 }, { -1, -1 } };
-
-        int index = 0;
-        while (gpuArchCoresPerSM[index].SM != -1)
-        {
-            if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor))
-                return gpuArchCoresPerSM[index].Cores;
-            index++;
-        }
-
-        return -1;
-    }
-}
-
-void cv::gpu::printCudaDeviceInfo(int device)
-{
-    int count = getCudaEnabledDeviceCount();
-    bool valid = (device >= 0) && (device < count);
-
-    int beg = valid ? device   : 0;
-    int end = valid ? device+1 : count;
-
-    printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n");
-    printf("Device count: %d\n", count);
-
-    int driverVersion = 0, runtimeVersion = 0;
-    cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
-    cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );
-
-    const char *computeMode[] = {
-        "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
-        "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
-        "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
-        "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
-        "Unknown",
-        NULL
-    };
-
-    for (int dev = beg; dev < end; ++dev)
-    {
-        cudaDeviceProp prop;
-        cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );
-
-        printf("\nDevice %d: \"%s\"\n", dev, prop.name);
-        printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
-        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", prop.major, prop.minor);
-        printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long)prop.totalGlobalMem);
-
-        int cores = convertSMVer2Cores(prop.major, prop.minor);
-        if (cores > 0)
-            printf("  (%2d) Multiprocessors x (%2d) CUDA Cores/MP:     %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount);
-
-        printf("  GPU Clock Speed:                               %.2f GHz\n", prop.clockRate * 1e-6f);
-        printf("  Max Texture Dimension Size (x,y,z)             1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]);
-        printf("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, 2D=(%d,%d) x %d\n", prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]);
-        ... /* further printfs report constant and shared memory, registers per block, warp size,
-               thread/block/grid limits, memory pitch, texture alignment, copy/execution overlap,
-               kernel run-time limit, integrated GPU, host page-locked mapping, concurrent kernels,
-               surface alignment, ECC, TCC driver mode, UVA, PCI bus/location IDs and computeMode[] */
-    }
-
-    printf("\n");
-    printf("deviceQuery, CUDA Driver = CUDART");
-    printf(", CUDA Driver Version  = %d.%d", driverVersion / 1000, driverVersion % 100);
-    printf(", CUDA Runtime Version = %d.%d", runtimeVersion / 1000, runtimeVersion % 100);
-    printf(", NumDevs = %d\n\n", count);
-    fflush(stdout);
-}
-
-void cv::gpu::printShortCudaDeviceInfo(int device)
-{
-    int count = getCudaEnabledDeviceCount();
-    bool valid = (device >= 0) && (device < count);
-
-    int beg = valid ? device   : 0;
-    int end = valid ? device+1 : count;
-
-    int driverVersion = 0, runtimeVersion = 0;
-    cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
-    cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );
-
-    for (int dev = beg; dev < end; ++dev)
-    {
-        cudaDeviceProp prop;
-        cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );
-
-        const char *arch_str = prop.major < 2 ? " (not Fermi)" : "";
-        printf("Device %d:  \"%s\"  %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f);
-        printf(", sm_%d%d%s", prop.major, prop.minor, arch_str);
-
-        int cores = convertSMVer2Cores(prop.major, prop.minor);
-        if (cores > 0)
-            printf(", %d cores", cores * prop.multiProcessorCount);
-
-        printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
-    }
-    fflush(stdout);
-}
-
-#endif // HAVE_CUDA
 
 //////////////////////////////// GpuMat ///////////////////////////////
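// Worked example of the key used by the removed convertSMVer2Cores() above: the table
// stores 0xMm, i.e. (major << 4) + minor, next to the CUDA-cores-per-multiprocessor count.
// Stand-alone illustration, not OpenCV code.
#include <cassert>
#include <cstdio>

int main()
{
    int major = 2, minor = 1;
    int key = (major << 4) + minor;   // 0x21 for a compute-capability 2.1 device
    assert(key == 0x21);
    std::printf("key = 0x%x -> 48 CUDA cores per multiprocessor (per the table above)\n", key);
    return 0;
}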
...
@@ -830,601 +403,6 @@ GpuMat cv::gpu::allocMatFromBuf(int rows, int cols, int type, GpuMat &mat)
     return mat = GpuMat(rows, cols, type);
 }
 
-namespace
-{
-    class GpuFuncTable
-    {
-    public:
-        virtual ~GpuFuncTable() {}
-
-        virtual void copy(const Mat& src, GpuMat& dst) const = 0;
-        virtual void copy(const GpuMat& src, Mat& dst) const = 0;
-        virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
-
-        virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
-
-        virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
-        virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0;
-
-        virtual void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const = 0;
-
-        virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
-        virtual void free(void* devPtr) const = 0;
-    };
-}
-
-#ifndef HAVE_CUDA
-
-namespace
-{
-    class EmptyFuncTable : public GpuFuncTable
-    {
-    public:
-        void copy(const Mat&, GpuMat&) const { throw_nogpu; }
-        void copy(const GpuMat&, Mat&) const { throw_nogpu; }
-        void copy(const GpuMat&, GpuMat&) const { throw_nogpu; }
-
-        void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; }
-
-        void convert(const GpuMat&, GpuMat&) const { throw_nogpu; }
-        void convert(const GpuMat&, GpuMat&, double, double) const { throw_nogpu; }
-
-        void setTo(GpuMat&, Scalar, const GpuMat&) const { throw_nogpu; }
-
-        void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; }
-        void free(void*) const {}
-    };
-
-    const GpuFuncTable* gpuFuncTable()
-    {
-        static EmptyFuncTable empty;
-        return &empty;
-    }
-}
-
-#else // HAVE_CUDA
-
-namespace cv { namespace gpu { namespace device
-{
-    void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
-
-    template <typename T>
-    void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream);
-
-    template <typename T>
-    void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-
-    void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
-}}}
-
-namespace
-{
-    template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
-    {
-        Scalar_<T> sf = s;
-        cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream);
-    }
-
-    template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
-    {
-        Scalar_<T> sf = s;
-        cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
-    }
-}
-
-namespace cv { namespace gpu
-{
-    CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, CUstream_st*);
-    CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&);
-    CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double, CUstream_st*);
-    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*);
-    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*);
-    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar);
-    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&);
-}}
-
-namespace cv { namespace gpu
-{
-    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0)
-    {
-        CV_Assert(src.size() == dst.size() && src.type() == dst.type());
-        CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));
-
-        cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream);
-    }
-
-    void convertTo(const GpuMat& src, GpuMat& dst)
-    {
-        cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0);
-    }
-
-    void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0)
-    {
-        cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream);
-    }
-
-    void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
-    {
-        typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream);
-
-        static const caller_t callers[] =
-        {
-            kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>,
-            kernelSetCaller<int>, kernelSetCaller<float>, kernelSetCaller<double>
-        };
-
-        callers[src.depth()](src, s, stream);
-    }
-
-    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
-    {
-        typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
-
-        static const caller_t callers[] =
-        {
-            kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>,
-            kernelSetCaller<int>, kernelSetCaller<float>, kernelSetCaller<double>
-        };
-
-        callers[src.depth()](src, s, mask, stream);
-    }
-
-    void setTo(GpuMat& src, Scalar s) { setTo(src, s, 0); }
-    void setTo(GpuMat& src, Scalar s, const GpuMat& mask) { setTo(src, s, mask, 0); }
-}}
-
-namespace
-{
-    template<int n> struct NPPTypeTraits;
-    template<> struct NPPTypeTraits<CV_8U>  { typedef Npp8u  npp_type; };
-    template<> struct NPPTypeTraits<CV_8S>  { typedef Npp8s  npp_type; };
-    template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
-    template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
-    template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
-    template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
-    template<> struct NPPTypeTraits<CV_64F> { typedef Npp64f npp_type; };
-
-    //////////////////////////////////////////////////////////////////////////
-    // Convert
-
-    template<int SDEPTH, int DDEPTH> struct NppConvertFunc
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
-
-        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
-    };
-    template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
-    {
-        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
-
-        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
-    };
-
-    template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
-
-        static void call(const GpuMat& src, GpuMat& dst)
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-    template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
-    {
-        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
-
-        static void call(const GpuMat& src, GpuMat& dst)
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-
-    //////////////////////////////////////////////////////////////////////////
-    // Set
-
-    template<int SDEPTH, int SCN> struct NppSetFunc
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
-    };
-    template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
-    };
-    template<int SCN> struct NppSetFunc<CV_8S, SCN>
-    {
-        typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
-    };
-    template<> struct NppSetFunc<CV_8S, 1>
-    {
-        typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
-    };
-
-    template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-
-        static void call(GpuMat& src, Scalar s)
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            Scalar_<src_t> nppS = s;
-
-            nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-    ... /* the NppSet<SDEPTH, 1, func> specialization passes nppS[0] instead of nppS.val;
-           NppSetMaskFunc/NppSetMask repeat the same pattern with an additional
-           const Npp8u* pMask, int nMaskStep pair taken from mask.ptr<Npp8u>() and mask.step;
-           NppCopyMaskedFunc/NppCopyMasked wrap the masked nppiCopy_* functions the same way */
-
-    template<typename T> static inline bool isAligned(const T* ptr, size_t size)
-    {
-        return reinterpret_cast<size_t>(ptr) % size == 0;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    // CudaFuncTable
-
-    class CudaFuncTable : public GpuFuncTable
-    {
-    public:
-        void copy(const Mat& src, GpuMat& dst) const
-        {
-            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) );
-        }
-        void copy(const GpuMat& src, Mat& dst) const
-        {
-            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) );
-        }
-        void copy(const GpuMat& src, GpuMat& dst) const
-        {
-            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) );
-        }
-
-        void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
-        {
-            CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
-            CV_Assert(src.size() == dst.size() && src.type() == dst.type());
-            CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));
-
-            if (src.depth() == CV_64F)
-            {
-                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-            }
-
-            typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
-            static const func_t funcs[7][4] =
-            {
-                /*  8U */ {NppCopyMasked<CV_8U , nppiCopy_8u_C1MR >::call, cv::gpu::copyWithMask, NppCopyMasked<CV_8U , nppiCopy_8u_C3MR >::call, NppCopyMasked<CV_8U , nppiCopy_8u_C4MR >::call},
-                /*  8S */ {cv::gpu::copyWithMask, cv::gpu::copyWithMask, cv::gpu::copyWithMask, cv::gpu::copyWithMask},
-                /* 16U */ {NppCopyMasked<CV_16U, nppiCopy_16u_C1MR>::call, cv::gpu::copyWithMask, NppCopyMasked<CV_16U, nppiCopy_16u_C3MR>::call, NppCopyMasked<CV_16U, nppiCopy_16u_C4MR>::call},
-                /* 16S */ {NppCopyMasked<CV_16S, nppiCopy_16s_C1MR>::call, cv::gpu::copyWithMask, NppCopyMasked<CV_16S, nppiCopy_16s_C3MR>::call, NppCopyMasked<CV_16S, nppiCopy_16s_C4MR>::call},
-                /* 32S */ {NppCopyMasked<CV_32S, nppiCopy_32s_C1MR>::call, cv::gpu::copyWithMask, NppCopyMasked<CV_32S, nppiCopy_32s_C3MR>::call, NppCopyMasked<CV_32S, nppiCopy_32s_C4MR>::call},
-                /* 32F */ {NppCopyMasked<CV_32F, nppiCopy_32f_C1MR>::call, cv::gpu::copyWithMask, NppCopyMasked<CV_32F, nppiCopy_32f_C3MR>::call, NppCopyMasked<CV_32F, nppiCopy_32f_C4MR>::call},
-                /* 64F */ {cv::gpu::copyWithMask, cv::gpu::copyWithMask, cv::gpu::copyWithMask, cv::gpu::copyWithMask}
-            };
-
-            const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::copyWithMask;
-
-            func(src, dst, mask, 0);
-        }
-
-        void convert(const GpuMat& src, GpuMat& dst) const
-        {
-            typedef void (*func_t)(const GpuMat& src, GpuMat& dst);
-            static const func_t funcs[7][7][4] =
-            {
-                /* 8U ->  8U */ {0, 0, 0, 0},
-                /* 8U ->  8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                /* 8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::call},
-                /* 8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::call},
-                /* 8U -> 32S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                /* 8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                /* 8U -> 64F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                ... /* analogous rows for the 8S, 16U, 16S, 32S, 32F and 64F source depths: same-depth
-                       entries are 0, the single-channel (and some 4-channel) NPP cases use
-                       nppiConvert_16u8u_C1R/C4R, nppiConvert_16u32s_C1R, nppiConvert_16u32f_C1R,
-                       nppiConvert_16s8u_C1R/C4R, nppiConvert_16s32s_C1R, nppiConvert_16s32f_C1R,
-                       nppiConvert_32f8u_C1R, nppiConvert_32f16u_C1R, nppiConvert_32f16s_C1R,
-                       and every remaining entry is cv::gpu::convertTo */
-            };
-
-            CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
-            CV_Assert(dst.depth() <= CV_64F);
-            CV_Assert(src.size() == dst.size() && src.channels() == dst.channels());
-
-            if (src.depth() == CV_64F || dst.depth() == CV_64F)
-            {
-                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-            }
-
-            bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
-            if (!aligned)
-            {
-                cv::gpu::convertTo(src, dst);
-                return;
-            }
-
-            const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1];
-            CV_DbgAssert(func != 0);
-
-            func(src, dst);
-        }
-
-        void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
-        {
-            CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
-            CV_Assert(dst.depth() <= CV_64F);
-
-            if (src.depth() == CV_64F || dst.depth() == CV_64F)
-            {
-                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-            }
-
-            cv::gpu::convertTo(src, dst, alpha, beta);
-        }
-
-        void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
-        {
-            if (mask.empty())
-            {
-                if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
-                {
-                    cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
-                    return;
-                }
-
-                if (m.depth() == CV_8U)
-                {
-                    int cn = m.channels();
-
-                    if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
-                    {
-                        int val = saturate_cast<uchar>(s[0]);
-                        cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
-                        return;
-                    }
-                }
-
-                typedef void (*func_t)(GpuMat& src, Scalar s);
-                static const func_t funcs[7][4] =
-                {
-                    {NppSet<CV_8U , 1, nppiSet_8u_C1R >::call, cv::gpu::setTo, cv::gpu::setTo, NppSet<CV_8U , 4, nppiSet_8u_C4R >::call},
-                    {cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo},
-                    {NppSet<CV_16U, 1, nppiSet_16u_C1R>::call, NppSet<CV_16U, 2, nppiSet_16u_C2R>::call, cv::gpu::setTo, NppSet<CV_16U, 4, nppiSet_16u_C4R>::call},
-                    {NppSet<CV_16S, 1, nppiSet_16s_C1R>::call, NppSet<CV_16S, 2, nppiSet_16s_C2R>::call, cv::gpu::setTo, NppSet<CV_16S, 4, nppiSet_16s_C4R>::call},
-                    {NppSet<CV_32S, 1, nppiSet_32s_C1R>::call, cv::gpu::setTo, cv::gpu::setTo, NppSet<CV_32S, 4, nppiSet_32s_C4R>::call},
-                    {NppSet<CV_32F, 1, nppiSet_32f_C1R>::call, cv::gpu::setTo, cv::gpu::setTo, NppSet<CV_32F, 4, nppiSet_32f_C4R>::call},
-                    {cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo}
-                };
-
-                CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
-
-                if (m.depth() == CV_64F)
-                {
-                    if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-                }
-
-                funcs[m.depth()][m.channels() - 1](m, s);
-            }
-            else
-            {
-                typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask);
-                static const func_t funcs[7][4] =
-                {
-                    {NppSetMask<CV_8U , 1, nppiSet_8u_C1MR >::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_8U , 4, nppiSet_8u_C4MR >::call},
-                    {cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo},
-                    {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::call},
-                    {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::call},
-                    {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::call},
-                    {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::call},
-                    {cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo}
-                };
-
-                CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
-
-                if (m.depth() == CV_64F)
-                {
-                    if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-                }
-
-                funcs[m.depth()][m.channels() - 1](m, s, mask);
-            }
-        }
-
-        void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
-        {
-            cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
-        }
-
-        void free(void* devPtr) const
-        {
-            cudaFree(devPtr);
-        }
-    };
-
-    const GpuFuncTable* gpuFuncTable()
-    {
-        static CudaFuncTable funcTable;
-        return &funcTable;
-    }
-}
-
-#endif // HAVE_CUDA
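// The removed CudaFuncTable methods above all use the same dispatch idiom: a static table
// indexed by OpenCV depth (CV_8U..CV_64F map to 0..6) and by channel count minus one,
// holding either an NPP wrapper or the generic CUDA kernel path. The stand-alone sketch
// below only illustrates that lookup; the function names here are placeholders, not NPP calls.
#include <cstdio>

typedef void (*func_t)(void);
static void npp_path(void)     { std::printf("NPP call\n"); }
static void generic_path(void) { std::printf("generic CUDA kernel\n"); }

int main()
{
    // rows: depth index 0..6, columns: channels 1..4 (indexed by channels - 1)
    static const func_t funcs[7][4] =
    {
        { npp_path,     generic_path, npp_path,     npp_path     },   // like the 8U row
        { generic_path, generic_path, generic_path, generic_path },   // like the 8S row
        { npp_path,     generic_path, npp_path,     npp_path     },
        { npp_path,     generic_path, npp_path,     npp_path     },
        { npp_path,     generic_path, npp_path,     npp_path     },
        { npp_path,     generic_path, npp_path,     npp_path     },
        { generic_path, generic_path, generic_path, generic_path }    // like the 64F row
    };

    int depth = 0, channels = 3;        // pretend the matrix is CV_8UC3
    funcs[depth][channels - 1]();       // picks the 3-channel entry of the 8U row
    return 0;
}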
 void cv::gpu::GpuMat::upload(const Mat& m)
 {
     CV_DbgAssert(!m.empty());
...
@@ -1492,9 +470,9 @@ void cv::gpu::GpuMat::convertTo(GpuMat& dst, int rtype, double alpha, double bet
     dst.create(size(), rtype);
 
     if (noScale)
-        gpuFuncTable()->convert(*psrc, dst);
+        cv::gpu::convertTo(*psrc, dst);
     else
-        gpuFuncTable()->convert(*psrc, dst, alpha, beta);
+        cv::gpu::convertTo(*psrc, dst, alpha, beta);
 }
 
 GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask)
...
@@ -1502,7 +480,7 @@ GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask)
     CV_Assert(mask.empty() || mask.type() == CV_8UC1);
     CV_DbgAssert(!empty());
 
-    gpuFuncTable()->setTo(*this, s, mask);
+    gpu::setTo(*this, s, mask);
 
     return *this;
 }
...
@@ -1562,6 +540,43 @@ void cv::gpu::GpuMat::release()
     refcount = 0;
 }
+#ifdef HAVE_CUDA
+
+namespace cv { namespace gpu
+{
+    void convertTo(const GpuMat& src, GpuMat& dst)
+    {
+        gpuFuncTable()->convert(src, dst);
+    }
+
+    void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream)
+    {
+        gpuFuncTable()->convert(src, dst, alpha, beta, stream);
+    }
+
+    void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
+    {
+        gpuFuncTable()->setTo(src, s, stream);
+    }
+
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
+    {
+        gpuFuncTable()->setTo(src, s, mask, stream);
+    }
+
+    void setTo(GpuMat& src, Scalar s)
+    {
+        setTo(src, s, 0);
+    }
+
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
+    {
+        setTo(src, s, mask, 0);
+    }
+}}
+
+#endif
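// How the re-routed GpuMat entry points above are reached from user code (illustrative
// sketch, assuming an OpenCV 2.4-era build; if no CUDA backend is present the calls end
// up in the EmptyFuncTable stubs and raise the "compiled without CUDA support" error).
#include <opencv2/core/core.hpp>
#include <opencv2/core/gpumat.hpp>

int main()
{
    cv::Mat host = cv::Mat::zeros(480, 640, CV_8UC1);

    cv::gpu::GpuMat dev;
    dev.upload(host);                              // GpuFuncTable::copy(Mat -> GpuMat)
    dev.setTo(cv::Scalar(128));                    // goes through gpu::setTo(...) added above

    cv::gpu::GpuMat dev32f;
    dev.convertTo(dev32f, CV_32F, 1.0 / 255.0);    // goes through cv::gpu::convertTo(...)
    return 0;
}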
 ////////////////////////////////////////////////////////////////////////
 // Error handling
...
@@ -1578,5 +593,5 @@ void cv::gpu::error(const char *error_string, const char *file, const int line,
         cerr.flush();
     }
     else
-        cv::error( cv::Exception(code, error_string, func, file, line) );
+        ::cv::error( ::cv::Exception(code, error_string, func, file, line) );
 }
modules/core/src/gpumat_cuda.hpp (new file)

namespace
{
#if defined(HAVE_CUDA) && !defined(DYNAMIC_CUDA_SUPPORT)
    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func)
    #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func)

    inline void ___cudaSafeCall(cudaError_t err, const char* file, const int line, const char* func = "")
    {
        if (cudaSuccess != err)
            cv::gpu::error(cudaGetErrorString(err), file, line, func);
    }

    inline void ___nppSafeCall(int err, const char* file, const int line, const char* func = "")
    {
        if (err < 0)
        {
            std::ostringstream msg;
            msg << "NPP API Call Error: " << err;
            cv::gpu::error(msg.str().c_str(), file, line, func);
        }
    }
#endif
}
namespace
{
    class GpuFuncTable
    {
    public:
        virtual ~GpuFuncTable() {}

        // DeviceInfo routines
        virtual int getCudaEnabledDeviceCount() const = 0;

        virtual void setDevice(int) const = 0;
        virtual int getDevice() const = 0;

        virtual void resetDevice() const = 0;

        virtual bool deviceSupports(FeatureSet) const = 0;

        virtual bool builtWith(FeatureSet) const = 0;
        virtual bool has(int, int) const = 0;
        virtual bool hasPtx(int, int) const = 0;
        virtual bool hasBin(int, int) const = 0;
        virtual bool hasEqualOrLessPtx(int, int) const = 0;
        virtual bool hasEqualOrGreater(int, int) const = 0;
        virtual bool hasEqualOrGreaterPtx(int, int) const = 0;
        virtual bool hasEqualOrGreaterBin(int, int) const = 0;

        virtual size_t sharedMemPerBlock() const = 0;
        virtual void queryMemory(size_t&, size_t&) const = 0;
        virtual size_t freeMemory() const = 0;
        virtual size_t totalMemory() const = 0;
        virtual bool supports(FeatureSet) const = 0;
        virtual bool isCompatible() const = 0;
        virtual void query() const = 0;
        virtual void printCudaDeviceInfo(int) const = 0;
        virtual void printShortCudaDeviceInfo(int) const = 0;

        // GpuMat routines
        virtual void copy(const Mat& src, GpuMat& dst) const = 0;
        virtual void copy(const GpuMat& src, Mat& dst) const = 0;
        virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;

        virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;

        // gpu::device::convertTo funcs
        virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) const = 0;
        virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;

        // for gpu::device::setTo funcs
        virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const = 0;
        virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0;

        virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
        virtual void free(void* devPtr) const = 0;
    };
}
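GpuFuncTable is the seam this commit introduces: the CUDA-free core only ever talks to this interface, and the concrete table is chosen at build (or, with DYNAMIC_CUDA_SUPPORT, at run) time. A minimal sketch of how the gpuFuncTable() accessor used in gpumat.cpp above could select between the two tables defined below is shown here; the real accessor lives in gpumat.cpp and may additionally load the CUDA backend from the separate dynamic library, so treat this strictly as an illustration.

// Hypothetical sketch only: mirrors the preprocessor guards used in this file.
static GpuFuncTable* gpuFuncTable()
{
#if !defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT)
    static EmptyFuncTable funcTable;  // stub: every call reports "no GPU support"
#else
    static CudaFuncTable funcTable;   // statically linked CUDA implementation
#endif
    return &funcTable;
}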
#if !defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT)
namespace
{
    class EmptyFuncTable : public GpuFuncTable
    {
    public:
        // DeviceInfo routines
        int getCudaEnabledDeviceCount() const { return 0; }

        void setDevice(int) const { throw_nogpu; }
        int getDevice() const { throw_nogpu; return 0; }

        void resetDevice() const { throw_nogpu; }

        bool deviceSupports(FeatureSet) const { throw_nogpu; return false; }

        bool builtWith(FeatureSet) const { throw_nogpu; return false; }
        bool has(int, int) const { throw_nogpu; return false; }
        bool hasPtx(int, int) const { throw_nogpu; return false; }
        bool hasBin(int, int) const { throw_nogpu; return false; }
        bool hasEqualOrLessPtx(int, int) const { throw_nogpu; return false; }
        bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; }
        bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; }
        bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; }

        size_t sharedMemPerBlock() const { throw_nogpu; return 0; }
        void queryMemory(size_t&, size_t&) const { throw_nogpu; }
        size_t freeMemory() const { throw_nogpu; return 0; }
        size_t totalMemory() const { throw_nogpu; return 0; }
        bool supports(FeatureSet) const { throw_nogpu; return false; }
        bool isCompatible() const { throw_nogpu; return false; }
        void query() const { throw_nogpu; }
        void printCudaDeviceInfo(int) const { throw_nogpu; }
        void printShortCudaDeviceInfo(int) const { throw_nogpu; }

        void copy(const Mat&, GpuMat&) const { throw_nogpu; }
        void copy(const GpuMat&, Mat&) const { throw_nogpu; }
        void copy(const GpuMat&, GpuMat&) const { throw_nogpu; }

        void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; }

        void convert(const GpuMat&, GpuMat&) const { throw_nogpu; }
        void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; }

        virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const { throw_nogpu; }
        virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; }

        void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; }
        void free(void*) const {}
    };
}
#else
namespace cv { namespace gpu { namespace device
{
    void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);

    template <typename T>
    void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream);

    template <typename T>
    void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);

    void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
}}}

namespace
{
    template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
    {
        Scalar_<T> sf = s;
        cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream);
    }

    template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
    {
        Scalar_<T> sf = s;
        cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
    }
}
namespace
{
    template<int n> struct NPPTypeTraits;
    template<> struct NPPTypeTraits<CV_8U>  { typedef Npp8u  npp_type; };
    template<> struct NPPTypeTraits<CV_8S>  { typedef Npp8s  npp_type; };
    template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
    template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
    template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
    template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
    template<> struct NPPTypeTraits<CV_64F> { typedef Npp64f npp_type; };

    //////////////////////////////////////////////////////////////////////////
    // Convert

    template<int SDEPTH, int DDEPTH> struct NppConvertFunc
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
    };
    template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
    {
        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
    };

    template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

        static void call(const GpuMat& src, GpuMat& dst)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
    template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
    {
        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

        static void call(const GpuMat& src, GpuMat& dst)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

    //////////////////////////////////////////////////////////////////////////
    // Set

    template<int SDEPTH, int SCN> struct NppSetFunc
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
    };
    template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
    };
    template<int SCN> struct NppSetFunc<CV_8S, SCN>
    {
        typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
    };
    template<> struct NppSetFunc<CV_8S, 1>
    {
        typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
    };

    template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        static void call(GpuMat& src, Scalar s)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            Scalar_<src_t> nppS = s;

            nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
    template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        static void call(GpuMat& src, Scalar s)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            Scalar_<src_t> nppS = s;

            nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

    template<int SDEPTH, int SCN> struct NppSetMaskFunc
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
    };
    template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
    };

    template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        static void call(GpuMat& src, Scalar s, const GpuMat& mask)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            Scalar_<src_t> nppS = s;

            nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
    template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        static void call(GpuMat& src, Scalar s, const GpuMat& mask)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            Scalar_<src_t> nppS = s;

            nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

    //////////////////////////////////////////////////////////////////////////
    // CopyMasked

    template<int SDEPTH> struct NppCopyMaskedFunc
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
    };

    template<int SDEPTH, typename NppCopyMaskedFunc<SDEPTH>::func_ptr func> struct NppCopyMasked
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<src_t>(), static_cast<int>(dst.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

    template <typename T> static inline bool isAligned(const T* ptr, size_t size)
    {
        return reinterpret_cast<size_t>(ptr) % size == 0;
    }
}
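Each of these wrapper templates binds one NPP entry point to a uniform call() signature, which is what lets the dispatch tables further down mix NPP-backed and custom-kernel paths in the same array. A small illustration follows; the instantiations shown are ones that actually appear in the tables below, but the surrounding function is hypothetical and exists only to make the snippet self-contained.

// Illustration only: how the NppSet/NppCvt wrappers are meant to be invoked.
static void nppWrapperExample()
{
    GpuMat m(480, 640, CV_8UC4);
    NppSet<CV_8U, 4, nppiSet_8u_C4R>::call(m, Scalar::all(5));        // fill all four channels with 5

    GpuMat src(480, 640, CV_8UC1), dst(480, 640, CV_16UC1);
    NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::call(src, dst);     // 8U -> 16U conversion via NPP
}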
namespace cv { namespace gpu { namespace devices
{
    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0)
    {
        CV_Assert(src.size() == dst.size() && src.type() == dst.type());
        CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));

        cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream);
    }

    void convertTo(const GpuMat& src, GpuMat& dst)
    {
        cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0);
    }

    void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0)
    {
        cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream);
    }

    void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
    {
        typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream);

        static const caller_t callers[] =
        {
            kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>,
            kernelSetCaller<int>, kernelSetCaller<float>, kernelSetCaller<double>
        };

        callers[src.depth()](src, s, stream);
    }

    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
    {
        typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);

        static const caller_t callers[] =
        {
            kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>,
            kernelSetCaller<int>, kernelSetCaller<float>, kernelSetCaller<double>
        };

        callers[src.depth()](src, s, mask, stream);
    }

    void setTo(GpuMat& src, Scalar s)
    {
        setTo(src, s, 0);
    }

    void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
    {
        setTo(src, s, mask, 0);
    }
}}}
namespace
{
    class CudaFuncTable : public GpuFuncTable
    {
    protected:

        class CudaArch
        {
        public:
            CudaArch();

            bool builtWith(FeatureSet feature_set) const;
            bool hasPtx(int major, int minor) const;
            bool hasBin(int major, int minor) const;
            bool hasEqualOrLessPtx(int major, int minor) const;
            bool hasEqualOrGreaterPtx(int major, int minor) const;
            bool hasEqualOrGreaterBin(int major, int minor) const;

        private:
            static void fromStr(const string& set_as_str, vector<int>& arr);

            vector<int> bin;
            vector<int> ptx;
            vector<int> features;
        };

        const CudaArch cudaArch;

        CudaArch::CudaArch()
        {
            fromStr(CUDA_ARCH_BIN, bin);
            fromStr(CUDA_ARCH_PTX, ptx);
            fromStr(CUDA_ARCH_FEATURES, features);
        }

        bool CudaArch::builtWith(FeatureSet feature_set) const
        {
            return !features.empty() && (features.back() >= feature_set);
        }

        bool CudaArch::hasPtx(int major, int minor) const
        {
            return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
        }

        bool CudaArch::hasBin(int major, int minor) const
        {
            return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
        }

        bool CudaArch::hasEqualOrLessPtx(int major, int minor) const
        {
            return !ptx.empty() && (ptx.front() <= major * 10 + minor);
        }

        bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const
        {
            return !ptx.empty() && (ptx.back() >= major * 10 + minor);
        }

        bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const
        {
            return !bin.empty() && (bin.back() >= major * 10 + minor);
        }

        void CudaArch::fromStr(const string& set_as_str, vector<int>& arr)
        {
            if (set_as_str.find_first_not_of(" ") == string::npos)
                return;

            istringstream stream(set_as_str);
            int cur_value;

            while (!stream.eof())
            {
                stream >> cur_value;
                arr.push_back(cur_value);
            }

            sort(arr.begin(), arr.end());
        }

        class DeviceProps
        {
        public:
            DeviceProps();
            ~DeviceProps();

            cudaDeviceProp* get(int devID);

        private:
            std::vector<cudaDeviceProp*> props_;
        };

        DeviceProps::DeviceProps()
        {
            props_.resize(10, 0);
        }

        DeviceProps::~DeviceProps()
        {
            for (size_t i = 0; i < props_.size(); ++i)
            {
                if (props_[i])
                    delete props_[i];
            }
            props_.clear();
        }

        cudaDeviceProp* DeviceProps::get(int devID)
        {
            if (devID >= (int) props_.size())
                props_.resize(devID + 5, 0);

            if (!props_[devID])
            {
                props_[devID] = new cudaDeviceProp;
                cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) );
            }

            return props_[devID];
        }

        DeviceProps deviceProps;

        int convertSMVer2Cores(int major, int minor)
        {
            // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
            typedef struct {
                int SM;    // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
                int Cores;
            } SMtoCores;

            SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, { 0x30, 192 }, { 0x35, 192 }, { -1, -1 } };

            int index = 0;
            while (gpuArchCoresPerSM[index].SM != -1)
            {
                if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor))
                    return gpuArchCoresPerSM[index].Cores;
                index++;
            }

            return -1;
        }
    public:
        int getCudaEnabledDeviceCount() const
        {
            int count;
            cudaError_t error = cudaGetDeviceCount(&count);

            if (error == cudaErrorInsufficientDriver)
                return -1;

            if (error == cudaErrorNoDevice)
                return 0;

            cudaSafeCall(error);
            return count;
        }

        void setDevice(int device) const
        {
            cudaSafeCall( cudaSetDevice(device) );
        }

        int getDevice() const
        {
            int device;
            cudaSafeCall( cudaGetDevice(&device) );
            return device;
        }

        void resetDevice() const
        {
            cudaSafeCall( cudaDeviceReset() );
        }

        bool TargetArchs::builtWith(FeatureSet feature_set) const
        {
            return cudaArch.builtWith(feature_set);
        }

        bool TargetArchs::has(int major, int minor) const
        {
            return hasPtx(major, minor) || hasBin(major, minor);
        }

        bool TargetArchs::hasPtx(int major, int minor) const
        {
            return cudaArch.hasPtx(major, minor);
        }

        bool TargetArchs::hasBin(int major, int minor) const
        {
            return cudaArch.hasBin(major, minor);
        }

        bool TargetArchs::hasEqualOrLessPtx(int major, int minor) const
        {
            return cudaArch.hasEqualOrLessPtx(major, minor);
        }

        bool TargetArchs::hasEqualOrGreater(int major, int minor) const
        {
            return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
        }

        bool TargetArchs::hasEqualOrGreaterPtx(int major, int minor) const
        {
            return cudaArch.hasEqualOrGreaterPtx(major, minor);
        }

        bool TargetArchs::hasEqualOrGreaterBin(int major, int minor) const
        {
            return cudaArch.hasEqualOrGreaterBin(major, minor);
        }

        bool deviceSupports(FeatureSet feature_set) const
        {
            static int versions[] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
            static const int cache_size = static_cast<int>(sizeof(versions) / sizeof(versions[0]));

            const int devId = getDevice();

            int version;

            if (devId < cache_size && versions[devId] >= 0)
                version = versions[devId];
            else
            {
                DeviceInfo dev(devId);
                version = dev.majorVersion() * 10 + dev.minorVersion();
                if (devId < cache_size)
                    versions[devId] = version;
            }

            return TargetArchs::builtWith(feature_set) && (version >= feature_set);
        }

        size_t sharedMemPerBlock() const
        {
            return deviceProps.get(device_id_)->sharedMemPerBlock;
        }

        void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const
        {
            int prevDeviceID = getDevice();
            if (prevDeviceID != device_id_)
                setDevice(device_id_);

            cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) );

            if (prevDeviceID != device_id_)
                setDevice(prevDeviceID);
        }

        size_t freeMemory() const
        {
            size_t _totalMemory, _freeMemory;
            queryMemory(_totalMemory, _freeMemory);
            return _freeMemory;
        }

        size_t totalMemory() const
        {
            size_t _totalMemory, _freeMemory;
            queryMemory(_totalMemory, _freeMemory);
            return _totalMemory;
        }

        bool supports(FeatureSet feature_set) const
        {
            int version = majorVersion() * 10 + minorVersion();
            return version >= feature_set;
        }

        bool isCompatible() const
        {
            // Check PTX compatibility
            if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion()))
                return true;

            // Check BIN compatibility
            for (int i = minorVersion(); i >= 0; --i)
                if (TargetArchs::hasBin(majorVersion(), i))
                    return true;

            return false;
        }

        void query() const
        {
            const cudaDeviceProp* prop = deviceProps.get(device_id_);

            name_ = prop->name;
            multi_processor_count_ = prop->multiProcessorCount;
            majorVersion_ = prop->major;
            minorVersion_ = prop->minor;
        }
        void printCudaDeviceInfo(int device) const
        {
            int count = getCudaEnabledDeviceCount();
            bool valid = (device >= 0) && (device < count);

            int beg = valid ? device   : 0;
            int end = valid ? device+1 : count;

            printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n");
            printf("Device count: %d\n", count);

            int driverVersion = 0, runtimeVersion = 0;
            cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
            cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );

            const char *computeMode[] = {
                "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
                "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
                "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
                "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
                "Unknown",
                NULL
            };

            for (int dev = beg; dev < end; ++dev)
            {
                cudaDeviceProp prop;
                cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );

                printf("\nDevice %d: \"%s\"\n", dev, prop.name);
                printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
                printf("  CUDA Capability Major/Minor version number:    %d.%d\n", prop.major, prop.minor);
                printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem);

                int cores = convertSMVer2Cores(prop.major, prop.minor);
                if (cores > 0)
                    printf("  (%2d) Multiprocessors x (%2d) CUDA Cores/MP:     %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount);

                printf("  GPU Clock Speed:                               %.2f GHz\n", prop.clockRate * 1e-6f);

                printf("  Max Texture Dimension Size (x,y,z)             1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]);
                printf("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, 2D=(%d,%d) x %d\n", prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]);

                printf("  Total amount of constant memory:               %u bytes\n", (int)prop.totalConstMem);
                printf("  Total amount of shared memory per block:       %u bytes\n", (int)prop.sharedMemPerBlock);
                printf("  Total number of registers available per block: %d\n", prop.regsPerBlock);
                printf("  Warp size:                                     %d\n", prop.warpSize);
                printf("  Maximum number of threads per block:           %d\n", prop.maxThreadsPerBlock);
                printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
                printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
                printf("  Maximum memory pitch:                          %u bytes\n", (int)prop.memPitch);
                printf("  Texture alignment:                             %u bytes\n", (int)prop.textureAlignment);

                printf("  Concurrent copy and execution:                 %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount);
                printf("  Run time limit on kernels:                     %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No");
                printf("  Integrated GPU sharing Host Memory:            %s\n", prop.integrated ? "Yes" : "No");
                printf("  Support host page-locked memory mapping:       %s\n", prop.canMapHostMemory ? "Yes" : "No");

                printf("  Concurrent kernel execution:                   %s\n", prop.concurrentKernels ? "Yes" : "No");
                printf("  Alignment requirement for Surfaces:            %s\n", prop.surfaceAlignment ? "Yes" : "No");
                printf("  Device has ECC support enabled:                %s\n", prop.ECCEnabled ? "Yes" : "No");
                printf("  Device is using TCC driver mode:               %s\n", prop.tccDriver ? "Yes" : "No");
                printf("  Device supports Unified Addressing (UVA):      %s\n", prop.unifiedAddressing ? "Yes" : "No");
                printf("  Device PCI Bus ID / PCI location ID:           %d / %d\n", prop.pciBusID, prop.pciDeviceID);
                printf("  Compute Mode:\n");
                printf("      %s \n", computeMode[prop.computeMode]);
            }

            printf("\n");
            printf("deviceQuery, CUDA Driver = CUDART");
            printf(", CUDA Driver Version  = %d.%d", driverVersion / 1000, driverVersion % 100);
            printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100);
            printf(", NumDevs = %d\n\n", count);
            fflush(stdout);
        }

        void printShortCudaDeviceInfo(int device) const
        {
            int count = getCudaEnabledDeviceCount();
            bool valid = (device >= 0) && (device < count);

            int beg = valid ? device   : 0;
            int end = valid ? device+1 : count;

            int driverVersion = 0, runtimeVersion = 0;
            cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
            cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );

            for (int dev = beg; dev < end; ++dev)
            {
                cudaDeviceProp prop;
                cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );

                const char *arch_str = prop.major < 2 ? " (not Fermi)" : "";
                printf("Device %d:  \"%s\"  %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f);
                printf(", sm_%d%d%s", prop.major, prop.minor, arch_str);

                int cores = convertSMVer2Cores(prop.major, prop.minor);
                if (cores > 0)
                    printf(", %d cores", cores * prop.multiProcessorCount);

                printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
            }
            fflush(stdout);
        }
        void copy(const Mat& src, GpuMat& dst) const
        {
            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) );
        }

        void copy(const GpuMat& src, Mat& dst) const
        {
            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) );
        }

        void copy(const GpuMat& src, GpuMat& dst) const
        {
            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) );
        }

        void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
        {
            CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
            CV_Assert(src.size() == dst.size() && src.type() == dst.type());
            CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));

            if (src.depth() == CV_64F)
            {
                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
            }

            typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
            static const func_t funcs[7][4] =
            {
                /*  8U */ {NppCopyMasked<CV_8U , nppiCopy_8u_C1MR >::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_8U , nppiCopy_8u_C3MR >::call, NppCopyMasked<CV_8U , nppiCopy_8u_C4MR >::call},
                /*  8S */ {cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask},
                /* 16U */ {NppCopyMasked<CV_16U, nppiCopy_16u_C1MR>::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_16U, nppiCopy_16u_C3MR>::call, NppCopyMasked<CV_16U, nppiCopy_16u_C4MR>::call},
                /* 16S */ {NppCopyMasked<CV_16S, nppiCopy_16s_C1MR>::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_16S, nppiCopy_16s_C3MR>::call, NppCopyMasked<CV_16S, nppiCopy_16s_C4MR>::call},
                /* 32S */ {NppCopyMasked<CV_32S, nppiCopy_32s_C1MR>::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_32S, nppiCopy_32s_C3MR>::call, NppCopyMasked<CV_32S, nppiCopy_32s_C4MR>::call},
                /* 32F */ {NppCopyMasked<CV_32F, nppiCopy_32f_C1MR>::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_32F, nppiCopy_32f_C3MR>::call, NppCopyMasked<CV_32F, nppiCopy_32f_C4MR>::call},
                /* 64F */ {cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask}
            };

            const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::details::copyWithMask;

            func(src, dst, mask, 0);
        }
        void convert(const GpuMat& src, GpuMat& dst) const
        {
            typedef void (*func_t)(const GpuMat& src, GpuMat& dst);
            static const func_t funcs[7][7][4] =
            {
                {
                    /*  8U ->  8U */ {0, 0, 0, 0},
                    /*  8U ->  8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /*  8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::call},
                    /*  8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::call},
                    /*  8U -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /*  8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /*  8U -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
                },
                {
                    /*  8S ->  8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /*  8S ->  8S */ {0, 0, 0, 0},
                    /*  8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /*  8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /*  8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /*  8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /*  8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
                },
                {
                    /* 16U ->  8U */ {NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::call},
                    /* 16U ->  8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 16U -> 16U */ {0, 0, 0, 0},
                    /* 16U -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 16U -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
                },
                {
                    /* 16S ->  8U */ {NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::call},
                    /* 16S ->  8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 16S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 16S -> 16S */ {0, 0, 0, 0},
                    /* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 16S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
                },
                {
                    /* 32S ->  8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 32S ->  8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 32S -> 32S */ {0, 0, 0, 0},
                    /* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
                },
                {
                    /* 32F ->  8U */ {NppCvt<CV_32F, CV_8U, nppiConvert_32f8u_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 32F ->  8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 32F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 32F -> 32F */ {0, 0, 0, 0},
                    /* 32F -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
                },
                {
                    /* 64F ->  8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 64F ->  8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 64F -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 64F -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
                    /* 64F -> 64F */ {0, 0, 0, 0}
                }
            };

            CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
            CV_Assert(dst.depth() <= CV_64F);
            CV_Assert(src.size() == dst.size() && src.channels() == dst.channels());

            if (src.depth() == CV_64F || dst.depth() == CV_64F)
            {
                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
            }

            bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
            if (!aligned)
            {
                cv::gpu::device::convertTo(src, dst);
                return;
            }

            const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1];
            CV_DbgAssert(func != 0);

            func(src, dst);
        }
        void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
        {
            CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
            CV_Assert(dst.depth() <= CV_64F);

            if (src.depth() == CV_64F || dst.depth() == CV_64F)
            {
                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
            }

            cv::gpu::device::convertTo(src, dst, alpha, beta);
        }
        void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
        {
            if (mask.empty())
            {
                if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
                {
                    cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
                    return;
                }

                if (m.depth() == CV_8U)
                {
                    int cn = m.channels();

                    if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
                    {
                        int val = saturate_cast<uchar>(s[0]);
                        cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
                        return;
                    }
                }

                typedef void (*func_t)(GpuMat& src, Scalar s);
                static const func_t funcs[7][4] =
                {
                    {NppSet<CV_8U , 1, nppiSet_8u_C1R >::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSet<CV_8U , 4, nppiSet_8u_C4R >::call},
                    {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo},
                    {NppSet<CV_16U, 1, nppiSet_16u_C1R>::call, NppSet<CV_16U, 2, nppiSet_16u_C2R>::call, cv::gpu::device::setTo, NppSet<CV_16U, 4, nppiSet_16u_C4R>::call},
                    {NppSet<CV_16S, 1, nppiSet_16s_C1R>::call, NppSet<CV_16S, 2, nppiSet_16s_C2R>::call, cv::gpu::device::setTo, NppSet<CV_16S, 4, nppiSet_16s_C4R>::call},
                    {NppSet<CV_32S, 1, nppiSet_32s_C1R>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSet<CV_32S, 4, nppiSet_32s_C4R>::call},
                    {NppSet<CV_32F, 1, nppiSet_32f_C1R>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSet<CV_32F, 4, nppiSet_32f_C4R>::call},
                    {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo}
                };

                CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);

                if (m.depth() == CV_64F)
                {
                    if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
                        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
                }

                funcs[m.depth()][m.channels() - 1](m, s);
            }
            else
            {
                typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask);
                static const func_t funcs[7][4] =
                {
                    {NppSetMask<CV_8U , 1, nppiSet_8u_C1MR >::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_8U , 4, nppiSet_8u_C4MR >::call},
                    {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo},
                    {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::call},
                    {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::call},
                    {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::call},
                    {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::call},
                    {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo}
                };

                CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);

                if (m.depth() == CV_64F)
                {
                    if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
                        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
                }

                funcs[m.depth()][m.channels() - 1](m, s, mask);
            }
        }
        void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
        {
            cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
        }

        void free(void* devPtr) const
        {
            cudaFree(devPtr);
        }
    };
}
#endif
\ No newline at end of file
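Per the commit message, the CUDA implementation behind CudaFuncTable is meant to live in a separate dynamic library (built from modules/core/cuda). A minimal sketch of how a DYNAMIC_CUDA_SUPPORT build might locate such a backend at run time is shown below; the library name and the exported factory symbol are hypothetical, chosen only to illustrate the idea, and the actual loader in this series is not part of this file.

// Hypothetical sketch: load a separately built CUDA backend at run time and
// fall back to the EmptyFuncTable stub when it is unavailable.
#include <dlfcn.h>

static GpuFuncTable* loadDynamicCudaTable()
{
    void* handle = dlopen("libopencv_core_cuda.so", RTLD_LAZY);   // illustrative name
    if (!handle)
        return 0;                                                 // caller falls back to EmptyFuncTable

    typedef GpuFuncTable* (*factory_t)();
    factory_t factory = (factory_t)dlsym(handle, "getCudaFuncTable");  // illustrative symbol
    return factory ? factory() : 0;
}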