Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
2777ebb8
Commit
2777ebb8
authored
Jun 28, 2012
by
Marina Kolpakova
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
merged GPU scan
parent
6cca6a45
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
25 changed files
with
450 additions
and
200 deletions
+450
-200
OpenCVDetectCUDA.cmake
cmake/OpenCVDetectCUDA.cmake
+5
-5
perf_imgproc.cpp
modules/gpu/perf/perf_imgproc.cpp
+34
-0
matrix_reductions.cu
modules/gpu/src/cuda/matrix_reductions.cu
+1
-1
resize.cu
modules/gpu/src/cuda/resize.cu
+2
-1
split_merge.cu
modules/gpu/src/cuda/split_merge.cu
+18
-18
NCVBroxOpticalFlow.cu
modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu
+0
-0
NCV.cu
modules/gpu/src/nvidia/core/NCV.cu
+8
-8
common.hpp
modules/gpu/src/opencv2/gpu/device/common.hpp
+7
-7
datamov_utils.hpp
modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp
+10
-10
dynamic_smem.hpp
modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp
+1
-1
emulation.hpp
modules/gpu/src/opencv2/gpu/device/emulation.hpp
+9
-9
funcattrib.hpp
modules/gpu/src/opencv2/gpu/device/funcattrib.hpp
+4
-4
functional.hpp
modules/gpu/src/opencv2/gpu/device/functional.hpp
+39
-17
limits.hpp
modules/gpu/src/opencv2/gpu/device/limits.hpp
+1
-1
saturate_cast.hpp
modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp
+56
-56
scan.hpp
modules/gpu/src/opencv2/gpu/device/scan.hpp
+167
-0
static_check.hpp
modules/gpu/src/opencv2/gpu/device/static_check.hpp
+12
-12
transform.hpp
modules/gpu/src/opencv2/gpu/device/transform.hpp
+1
-1
type_traits.hpp
modules/gpu/src/opencv2/gpu/device/type_traits.hpp
+11
-11
utility.hpp
modules/gpu/src/opencv2/gpu/device/utility.hpp
+16
-16
vec_distance.hpp
modules/gpu/src/opencv2/gpu/device/vec_distance.hpp
+4
-4
vec_math.hpp
modules/gpu/src/opencv2/gpu/device/vec_math.hpp
+4
-3
vec_traits.hpp
modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
+7
-7
warp.hpp
modules/gpu/src/opencv2/gpu/device/warp.hpp
+7
-7
resize.cpp
modules/gpu/src/resize.cpp
+26
-1
No files found.
cmake/OpenCVDetectCUDA.cmake
View file @
2777ebb8
if
(
${
CMAKE_VERSION
}
VERSION_LESS
"2.8.3"
)
message
(
STATUS WITH_CUDA flag requires CMake 2.8.3. CUDA support is disabled.
)
return
()
return
()
endif
()
find_package
(
CUDA 4.1
)
if
(
CUDA_FOUND
)
...
...
@@ -23,7 +23,7 @@ if(CUDA_FOUND)
else
()
set
(
CUDA_ARCH_BIN
"1.1 1.2 1.3 2.0 2.1(2.0)"
CACHE STRING
"Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported"
)
endif
()
set
(
CUDA_ARCH_PTX
"2.0"
CACHE STRING
"Specify 'virtual' PTX architectures to build PTX intermediate code for"
)
string
(
REGEX REPLACE
"
\\
."
""
ARCH_BIN_NO_POINTS
"
${
CUDA_ARCH_BIN
}
"
)
...
...
@@ -89,8 +89,8 @@ if(CUDA_FOUND)
set
(
CUDA_NVCC_FLAGS
${
CUDA_NVCC_FLAGS
}
-Xcompiler -fno-finite-math-only
)
endif
()
# we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
set
(
CMAKE_CXX_FLAGS_DEBUG_
${
CMAKE_CXX_FLAGS_DEBUG
}
)
# we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
set
(
CMAKE_CXX_FLAGS_DEBUG_
${
CMAKE_CXX_FLAGS_DEBUG
}
)
string
(
REPLACE
"-ggdb3"
""
CMAKE_CXX_FLAGS_DEBUG
${
CMAKE_CXX_FLAGS_DEBUG
}
)
CUDA_COMPILE
(
${
VAR
}
${
ARGN
}
)
set
(
CMAKE_CXX_DEBUG_FLAGS
${
CMAKE_CXX_FLAGS_DEBUG_
}
)
...
...
modules/gpu/perf/perf_imgproc.cpp
View file @
2777ebb8
...
...
@@ -90,6 +90,40 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Resize, testing::Combine(
Interpolation
(
cv
::
INTER_CUBIC
),
Interpolation
(
cv
::
INTER_AREA
)),
testing
::
Values
(
Scale
(
0.5
),
Scale
(
0.3
),
Scale
(
2.0
))));
GPU_PERF_TEST
(
ResizeArea
,
cv
::
gpu
::
DeviceInfo
,
cv
::
Size
,
MatType
,
Scale
)
{
cv
::
gpu
::
DeviceInfo
devInfo
=
GET_PARAM
(
0
);
cv
::
gpu
::
setDevice
(
devInfo
.
deviceID
());
cv
::
Size
size
=
GET_PARAM
(
1
);
int
type
=
GET_PARAM
(
2
);
int
interpolation
=
cv
::
INTER_AREA
;
double
f
=
GET_PARAM
(
3
);
cv
::
Mat
src_host
(
size
,
type
);
fill
(
src_host
,
0
,
255
);
cv
::
gpu
::
GpuMat
src
(
src_host
);
cv
::
gpu
::
GpuMat
dst
;
cv
::
gpu
::
resize
(
src
,
dst
,
cv
::
Size
(),
f
,
f
,
interpolation
);
declare
.
time
(
1.0
);
TEST_CYCLE
()
{
cv
::
gpu
::
resize
(
src
,
dst
,
cv
::
Size
(),
f
,
f
,
interpolation
);
}
}
INSTANTIATE_TEST_CASE_P
(
ImgProc
,
ResizeArea
,
testing
::
Combine
(
ALL_DEVICES
,
testing
::
Values
(
perf
::
sz1080p
/*, cv::Size(4096, 2048)*/
),
testing
::
Values
(
MatType
(
CV_8UC1
),
MatType
(
CV_8UC3
),
MatType
(
CV_8UC4
),
MatType
(
CV_16UC1
),
MatType
(
CV_16UC3
),
MatType
(
CV_16UC4
),
MatType
(
CV_32FC1
),
MatType
(
CV_32FC3
),
MatType
(
CV_32FC4
)),
testing
::
Values
(
Scale
(
0.2
),
Scale
(
0.1
),
Scale
(
0.05
))));
//////////////////////////////////////////////////////////////////////
// WarpAffine
...
...
modules/gpu/src/cuda/matrix_reductions.cu
View file @
2777ebb8
...
...
@@ -72,7 +72,7 @@ namespace cv { namespace gpu { namespace device
struct Mask8U
{
explicit Mask8U(PtrStepb mask
): mask(mask
) {}
explicit Mask8U(PtrStepb mask
_): mask(mask_
) {}
__device__ __forceinline__ bool operator()(int y, int x) const
{
...
...
modules/gpu/src/cuda/resize.cu
View file @
2777ebb8
...
...
@@ -46,7 +46,8 @@
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"
# include <cfloat>
#include <cfloat>
#include <opencv2/gpu/device/scan.hpp>
namespace cv { namespace gpu { namespace device
{
...
...
modules/gpu/src/cuda/split_merge.cu
View file @
2777ebb8
...
...
@@ -228,9 +228,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
dim3 block
Dim
(32, 8);
dim3 grid
Dim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim
.y));
mergeC2_<T><<<grid
Dim, blockDim
, 0, stream>>>(
dim3 block(32, 8);
dim3 grid
(divUp(dst.cols, block.x), divUp(dst.rows, block
.y));
mergeC2_<T><<<grid
, block
, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
dst.rows, dst.cols, dst.data, dst.step);
...
...
@@ -244,9 +244,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
dim3 block
Dim
(32, 8);
dim3 grid
Dim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim
.y));
mergeC3_<T><<<grid
Dim, blockDim
, 0, stream>>>(
dim3 block(32, 8);
dim3 grid
(divUp(dst.cols, block.x), divUp(dst.rows, block
.y));
mergeC3_<T><<<grid
, block
, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
...
...
@@ -261,9 +261,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
dim3 block
Dim
(32, 8);
dim3 grid
Dim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim
.y));
mergeC4_<T><<<grid
Dim, blockDim
, 0, stream>>>(
dim3 block(32, 8);
dim3 grid
(divUp(dst.cols, block.x), divUp(dst.rows, block
.y));
mergeC4_<T><<<grid
, block
, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
...
...
@@ -437,9 +437,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{
dim3 block
Dim
(32, 8);
dim3 grid
Dim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim
.y));
splitC2_<T><<<grid
Dim, blockDim
, 0, stream>>>(
dim3 block(32, 8);
dim3 grid
(divUp(src.cols, block.x), divUp(src.rows, block
.y));
splitC2_<T><<<grid
, block
, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step);
...
...
@@ -453,9 +453,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{
dim3 block
Dim
(32, 8);
dim3 grid
Dim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim
.y));
splitC3_<T><<<grid
Dim, blockDim
, 0, stream>>>(
dim3 block(32, 8);
dim3 grid
(divUp(src.cols, block.x), divUp(src.rows, block
.y));
splitC3_<T><<<grid
, block
, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
...
...
@@ -470,9 +470,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{
dim3 block
Dim
(32, 8);
dim3 grid
Dim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim
.y));
splitC4_<T><<<grid
Dim, blockDim
, 0, stream>>>(
dim3 block(32, 8);
dim3 grid
(divUp(src.cols, block.x), divUp(src.rows, block
.y));
splitC4_<T><<<grid
, block
, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
...
...
modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu
View file @
2777ebb8
This diff is collapsed.
Click to expand it.
modules/gpu/src/nvidia/core/NCV.cu
View file @
2777ebb8
...
...
@@ -252,7 +252,7 @@ NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
//===================================================================
NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment
_
)
:
currentSize(0),
_maxSize(0),
...
...
@@ -260,23 +260,23 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
begin(NULL),
end(NULL),
_memType(NCVMemoryTypeNone),
_alignment(alignment),
_alignment(alignment
_
),
bReusesMemory(false)
{
NcvBool bProperAlignment = (alignment
& (alignment-
1)) == 0;
NcvBool bProperAlignment = (alignment
_ & (alignment_ -
1)) == 0;
ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
}
NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr)
NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment
_
, void *reusePtr)
:
currentSize(0),
_maxSize(0),
allocBegin(NULL),
_memType(memT),
_alignment(alignment)
_alignment(alignment
_
)
{
NcvBool bProperAlignment = (alignment
& (alignment-
1)) == 0;
NcvBool bProperAlignment = (alignment
_ & (alignment_ -
1)) == 0;
ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2");
ncvAssertPrintCheck(memT != NCVMemoryTypeNone, "NCVMemStackAllocator ctor:: Incorrect allocator type");
...
...
@@ -425,12 +425,12 @@ size_t NCVMemStackAllocator::maxSize(void) const
//===================================================================
NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment)
NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment
_
)
:
currentSize(0),
_maxSize(0),
_memType(memT),
_alignment(alignment)
_alignment(alignment
_
)
{
ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );
}
...
...
modules/gpu/src/opencv2/gpu/device/common.hpp
View file @
2777ebb8
...
...
@@ -64,7 +64,7 @@
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
#endif
namespace
cv
{
namespace
gpu
namespace
cv
{
namespace
gpu
{
void
error
(
const
char
*
error_string
,
const
char
*
file
,
const
int
line
,
const
char
*
func
);
...
...
@@ -87,14 +87,14 @@ static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int
#ifdef __CUDACC__
namespace
cv
{
namespace
gpu
{
__host__
__device__
__forceinline__
int
divUp
(
int
total
,
int
grain
)
{
return
(
total
+
grain
-
1
)
/
grain
;
namespace
cv
{
namespace
gpu
{
__host__
__device__
__forceinline__
int
divUp
(
int
total
,
int
grain
)
{
return
(
total
+
grain
-
1
)
/
grain
;
}
namespace
device
namespace
device
{
typedef
unsigned
char
uchar
;
typedef
unsigned
short
ushort
;
...
...
modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp
View file @
2777ebb8
...
...
@@ -45,7 +45,7 @@
#include "common.hpp"
namespace
cv
{
namespace
gpu
{
namespace
device
namespace
cv
{
namespace
gpu
{
namespace
device
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
...
...
@@ -54,13 +54,13 @@ namespace cv { namespace gpu { namespace device
{
__device__
__forceinline__
static
void
Load
(
const
T
*
ptr
,
int
offset
,
T
&
val
)
{
val
=
ptr
[
offset
];
}
};
#else // __CUDA_ARCH__ >= 200
#if defined(_WIN64) || defined(__LP64__)
#else // __CUDA_ARCH__ >= 200
#if defined(_WIN64) || defined(__LP64__)
// 64-bit register modifier for inlined asm
#define OPENCV_GPU_ASM_PTR "l"
#else
#else
// 32-bit register modifier for inlined asm
#define OPENCV_GPU_ASM_PTR "r"
#endif
...
...
@@ -84,21 +84,21 @@ namespace cv { namespace gpu { namespace device
asm
(
"ld.global."
#
ptx_type
" %0, [%1];"
:
"=r"
(
*
reinterpret_cast
<
uint
*>
(
&
val
))
:
OPENCV_GPU_ASM_PTR
(
ptr
+
offset
));
\
}
\
};
OPENCV_GPU_DEFINE_FORCE_GLOB_B
(
uchar
,
u8
)
OPENCV_GPU_DEFINE_FORCE_GLOB_B
(
schar
,
s8
)
OPENCV_GPU_DEFINE_FORCE_GLOB_B
(
char
,
b8
)
OPENCV_GPU_DEFINE_FORCE_GLOB
(
ushort
,
u16
,
h
)
OPENCV_GPU_DEFINE_FORCE_GLOB
(
short
,
s16
,
h
)
OPENCV_GPU_DEFINE_FORCE_GLOB
(
uint
,
u32
,
r
)
OPENCV_GPU_DEFINE_FORCE_GLOB
(
int
,
s32
,
r
)
OPENCV_GPU_DEFINE_FORCE_GLOB
(
float
,
f32
,
f
)
OPENCV_GPU_DEFINE_FORCE_GLOB
(
double
,
f64
,
d
)
OPENCV_GPU_DEFINE_FORCE_GLOB
(
int
,
s32
,
r
)
OPENCV_GPU_DEFINE_FORCE_GLOB
(
float
,
f32
,
f
)
OPENCV_GPU_DEFINE_FORCE_GLOB
(
double
,
f64
,
d
)
#undef OPENCV_GPU_DEFINE_FORCE_GLOB
#undef OPENCV_GPU_DEFINE_FORCE_GLOB_B
#undef OPENCV_GPU_ASM_PTR
#endif // __CUDA_ARCH__ >= 200
}}}
// namespace cv { namespace gpu { namespace device
...
...
modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp
View file @
2777ebb8
...
...
@@ -44,7 +44,7 @@
#define __OPENCV_GPU_DYNAMIC_SMEM_HPP__
namespace
cv
{
namespace
gpu
{
namespace
device
{
{
template
<
class
T
>
struct
DynamicSharedMem
{
__device__
__forceinline__
operator
T
*
()
...
...
modules/gpu/src/opencv2/gpu/device/emulation.hpp
View file @
2777ebb8
...
...
@@ -45,21 +45,21 @@
#include "warp_reduce.hpp"
namespace
cv
{
namespace
gpu
{
namespace
device
namespace
cv
{
namespace
gpu
{
namespace
device
{
struct
Emulation
{
static
__forceinline__
__device__
int
Ballot
(
int
predicate
,
volatile
int
*
cta_buffer
)
{
static
__forceinline__
__device__
int
Ballot
(
int
predicate
,
volatile
int
*
cta_buffer
)
{
#if __CUDA_ARCH__ >= 200
(
void
)
cta_buffer
;
return
__ballot
(
predicate
);
(
void
)
cta_buffer
;
return
__ballot
(
predicate
);
#else
int
tid
=
threadIdx
.
x
;
cta_buffer
[
tid
]
=
predicate
?
(
1
<<
(
tid
&
31
))
:
0
;
return
warp_reduce
(
cta_buffer
);
int
tid
=
threadIdx
.
x
;
cta_buffer
[
tid
]
=
predicate
?
(
1
<<
(
tid
&
31
))
:
0
;
return
warp_reduce
(
cta_buffer
);
#endif
}
}
};
}}}
// namespace cv { namespace gpu { namespace device
...
...
modules/gpu/src/opencv2/gpu/device/funcattrib.hpp
View file @
2777ebb8
...
...
@@ -46,14 +46,14 @@
#include <cstdio>
namespace
cv
{
namespace
gpu
{
namespace
device
namespace
cv
{
namespace
gpu
{
namespace
device
{
template
<
class
Func
>
template
<
class
Func
>
void
printFuncAttrib
(
Func
&
func
)
{
cudaFuncAttributes
attrs
;
cudaFuncGetAttributes
(
&
attrs
,
func
);
cudaFuncGetAttributes
(
&
attrs
,
func
);
printf
(
"=== Function stats ===
\n
"
);
printf
(
"Name:
\n
"
);
...
...
@@ -65,7 +65,7 @@ namespace cv { namespace gpu { namespace device
printf
(
"ptxVersion = %d
\n
"
,
attrs
.
ptxVersion
);
printf
(
"binaryVersion = %d
\n
"
,
attrs
.
binaryVersion
);
printf
(
"
\n
"
);
fflush
(
stdout
);
fflush
(
stdout
);
}
}}}
// namespace cv { namespace gpu { namespace device
...
...
modules/gpu/src/opencv2/gpu/device/functional.hpp
View file @
2777ebb8
...
...
@@ -48,7 +48,7 @@
#include "vec_traits.hpp"
#include "type_traits.hpp"
namespace
cv
{
namespace
gpu
{
namespace
device
namespace
cv
{
namespace
gpu
{
namespace
device
{
// Function Objects
...
...
@@ -257,7 +257,7 @@ namespace cv { namespace gpu { namespace device
template
<
typename
T
>
struct
bit_not
:
unary_function
<
T
,
T
>
{
__device__
__forceinline__
T
operator
()(
typename
TypeTraits
<
T
>::
ParameterType
v
)
const
__device__
__forceinline__
T
operator
()(
typename
TypeTraits
<
T
>::
ParameterType
v
)
const
{
return
~
v
;
}
...
...
@@ -268,7 +268,7 @@ namespace cv { namespace gpu { namespace device
// Generalized Identity Operations
template
<
typename
T
>
struct
identity
:
unary_function
<
T
,
T
>
{
__device__
__forceinline__
typename
TypeTraits
<
T
>::
ParameterType
operator
()(
typename
TypeTraits
<
T
>::
ParameterType
x
)
const
__device__
__forceinline__
typename
TypeTraits
<
T
>::
ParameterType
operator
()(
typename
TypeTraits
<
T
>::
ParameterType
x
)
const
{
return
x
;
}
...
...
@@ -278,7 +278,7 @@ namespace cv { namespace gpu { namespace device
template
<
typename
T1
,
typename
T2
>
struct
project1st
:
binary_function
<
T1
,
T2
,
T1
>
{
__device__
__forceinline__
typename
TypeTraits
<
T1
>::
ParameterType
operator
()(
typename
TypeTraits
<
T1
>::
ParameterType
lhs
,
typename
TypeTraits
<
T2
>::
ParameterType
rhs
)
const
__device__
__forceinline__
typename
TypeTraits
<
T1
>::
ParameterType
operator
()(
typename
TypeTraits
<
T1
>::
ParameterType
lhs
,
typename
TypeTraits
<
T2
>::
ParameterType
rhs
)
const
{
return
lhs
;
}
...
...
@@ -288,7 +288,7 @@ namespace cv { namespace gpu { namespace device
template
<
typename
T1
,
typename
T2
>
struct
project2nd
:
binary_function
<
T1
,
T2
,
T2
>
{
__device__
__forceinline__
typename
TypeTraits
<
T2
>::
ParameterType
operator
()(
typename
TypeTraits
<
T1
>::
ParameterType
lhs
,
typename
TypeTraits
<
T2
>::
ParameterType
rhs
)
const
__device__
__forceinline__
typename
TypeTraits
<
T2
>::
ParameterType
operator
()(
typename
TypeTraits
<
T1
>::
ParameterType
lhs
,
typename
TypeTraits
<
T2
>::
ParameterType
rhs
)
const
{
return
rhs
;
}
...
...
@@ -308,7 +308,7 @@ namespace cv { namespace gpu { namespace device
template
<
typename
T
>
struct
maximum
:
binary_function
<
T
,
T
,
T
>
{
__device__
__forceinline__
T
operator
()(
typename
TypeTraits
<
T
>::
ParameterType
lhs
,
typename
TypeTraits
<
T
>::
ParameterType
rhs
)
const
__device__
__forceinline__
T
operator
()(
typename
TypeTraits
<
T
>::
ParameterType
lhs
,
typename
TypeTraits
<
T
>::
ParameterType
rhs
)
const
{
return
lhs
<
rhs
?
rhs
:
lhs
;
}
...
...
@@ -328,7 +328,7 @@ namespace cv { namespace gpu { namespace device
template
<
typename
T
>
struct
minimum
:
binary_function
<
T
,
T
,
T
>
{
__device__
__forceinline__
T
operator
()(
typename
TypeTraits
<
T
>::
ParameterType
lhs
,
typename
TypeTraits
<
T
>::
ParameterType
rhs
)
const
__device__
__forceinline__
T
operator
()(
typename
TypeTraits
<
T
>::
ParameterType
lhs
,
typename
TypeTraits
<
T
>::
ParameterType
rhs
)
const
{
return
lhs
<
rhs
?
lhs
:
rhs
;
}
...
...
@@ -410,12 +410,14 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR
#undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR
template
<
typename
T
>
struct
hypot_sqr_func
:
binary_function
<
T
,
T
,
float
>
template
<
typename
T
>
struct
hypot_sqr_func
:
binary_function
<
T
,
T
,
float
>
{
__device__
__forceinline__
T
operator
()(
typename
TypeTraits
<
T
>::
ParameterType
src1
,
typename
TypeTraits
<
T
>::
ParameterType
src2
)
const
{
return
src1
*
src1
+
src2
*
src2
;
}
__device__
__forceinline__
hypot_sqr_func
(
const
hypot_sqr_func
&
other
)
:
binary_function
<
T
,
T
,
float
>
(){}
__device__
__forceinline__
hypot_sqr_func
()
:
binary_function
<
T
,
T
,
float
>
(){}
};
// Saturate Cast Functor
...
...
@@ -438,6 +440,7 @@ namespace cv { namespace gpu { namespace device
{
return
(
src
>
thresh
)
*
maxVal
;
}
__device__
__forceinline__
thresh_binary_func
(
const
thresh_binary_func
&
other
)
:
unary_function
<
T
,
T
>
(),
thresh
(
other
.
thresh
),
maxVal
(
other
.
maxVal
){}
...
...
@@ -455,6 +458,7 @@ namespace cv { namespace gpu { namespace device
{
return
(
src
<=
thresh
)
*
maxVal
;
}
__device__
__forceinline__
thresh_binary_inv_func
(
const
thresh_binary_inv_func
&
other
)
:
unary_function
<
T
,
T
>
(),
thresh
(
other
.
thresh
),
maxVal
(
other
.
maxVal
){}
...
...
@@ -519,12 +523,16 @@ namespace cv { namespace gpu { namespace device
explicit
__host__
__device__
__forceinline__
unary_negate
(
const
Predicate
&
p
)
:
pred
(
p
)
{}
__device__
__forceinline__
bool
operator
()(
typename
TypeTraits
<
typename
Predicate
::
argument_type
>::
ParameterType
x
)
const
{
return
!
pred
(
x
);
{
return
!
pred
(
x
);
}
__device__
__forceinline__
unary_negate
(
const
unary_negate
&
other
)
:
unary_function
<
typename
Predicate
::
argument_type
,
bool
>
(){}
__device__
__forceinline__
unary_negate
()
:
unary_function
<
typename
Predicate
::
argument_type
,
bool
>
(){}
const
Predicate
pred
;
};
template
<
typename
Predicate
>
__host__
__device__
__forceinline__
unary_negate
<
Predicate
>
not1
(
const
Predicate
&
pred
)
{
return
unary_negate
<
Predicate
>
(
pred
);
...
...
@@ -534,19 +542,26 @@ namespace cv { namespace gpu { namespace device
{
explicit
__host__
__device__
__forceinline__
binary_negate
(
const
Predicate
&
p
)
:
pred
(
p
)
{}
__device__
__forceinline__
bool
operator
()(
typename
TypeTraits
<
typename
Predicate
::
first_argument_type
>::
ParameterType
x
,
typename
TypeTraits
<
typename
Predicate
::
second_argument_type
>::
ParameterType
y
)
const
{
return
!
pred
(
x
,
y
);
__device__
__forceinline__
bool
operator
()(
typename
TypeTraits
<
typename
Predicate
::
first_argument_type
>::
ParameterType
x
,
typename
TypeTraits
<
typename
Predicate
::
second_argument_type
>::
ParameterType
y
)
const
{
return
!
pred
(
x
,
y
);
}
__device__
__forceinline__
binary_negate
(
const
binary_negate
&
other
)
:
binary_function
<
typename
Predicate
::
first_argument_type
,
typename
Predicate
::
second_argument_type
,
bool
>
(){}
__device__
__forceinline__
binary_negate
()
:
binary_function
<
typename
Predicate
::
first_argument_type
,
typename
Predicate
::
second_argument_type
,
bool
>
(){}
const
Predicate
pred
;
};
template
<
typename
BinaryPredicate
>
__host__
__device__
__forceinline__
binary_negate
<
BinaryPredicate
>
not2
(
const
BinaryPredicate
&
pred
)
{
return
binary_negate
<
BinaryPredicate
>
(
pred
);
}
template
<
typename
Op
>
struct
binder1st
:
unary_function
<
typename
Op
::
second_argument_type
,
typename
Op
::
result_type
>
template
<
typename
Op
>
struct
binder1st
:
unary_function
<
typename
Op
::
second_argument_type
,
typename
Op
::
result_type
>
{
__host__
__device__
__forceinline__
binder1st
(
const
Op
&
op_
,
const
typename
Op
::
first_argument_type
&
arg1_
)
:
op
(
op_
),
arg1
(
arg1_
)
{}
...
...
@@ -555,15 +570,19 @@ namespace cv { namespace gpu { namespace device
return
op
(
arg1
,
a
);
}
__device__
__forceinline__
binder1st
(
const
binder1st
&
other
)
:
unary_function
<
typename
Op
::
second_argument_type
,
typename
Op
::
result_type
>
(){}
const
Op
op
;
const
typename
Op
::
first_argument_type
arg1
;
};
template
<
typename
Op
,
typename
T
>
__host__
__device__
__forceinline__
binder1st
<
Op
>
bind1st
(
const
Op
&
op
,
const
T
&
x
)
{
return
binder1st
<
Op
>
(
op
,
typename
Op
::
first_argument_type
(
x
));
}
template
<
typename
Op
>
struct
binder2nd
:
unary_function
<
typename
Op
::
first_argument_type
,
typename
Op
::
result_type
>
template
<
typename
Op
>
struct
binder2nd
:
unary_function
<
typename
Op
::
first_argument_type
,
typename
Op
::
result_type
>
{
__host__
__device__
__forceinline__
binder2nd
(
const
Op
&
op_
,
const
typename
Op
::
second_argument_type
&
arg2_
)
:
op
(
op_
),
arg2
(
arg2_
)
{}
...
...
@@ -572,16 +591,19 @@ namespace cv { namespace gpu { namespace device
return
op
(
a
,
arg2
);
}
__device__
__forceinline__
binder2nd
(
const
binder2nd
&
other
)
:
unary_function
<
typename
Op
::
first_argument_type
,
typename
Op
::
result_type
>
(),
op
(
other
.
op
),
arg2
(
other
.
arg2
){}
const
Op
op
;
const
typename
Op
::
second_argument_type
arg2
;
};
template
<
typename
Op
,
typename
T
>
__host__
__device__
__forceinline__
binder2nd
<
Op
>
bind2nd
(
const
Op
&
op
,
const
T
&
x
)
{
return
binder2nd
<
Op
>
(
op
,
typename
Op
::
second_argument_type
(
x
));
}
// Functor Traits
template
<
typename
F
>
struct
IsUnaryFunction
{
typedef
char
Yes
;
...
...
@@ -618,7 +640,7 @@ namespace cv { namespace gpu { namespace device
{
enum
{
shift
=
UnOpShift
<
sizeof
(
T
),
sizeof
(
D
)
>::
shift
};
};
template
<
size_t
src_elem_size1
,
size_t
src_elem_size2
,
size_t
dst_elem_size
>
struct
BinOpShift
{
enum
{
shift
=
1
};
};
template
<
size_t
src_elem_size1
,
size_t
src_elem_size2
>
struct
BinOpShift
<
src_elem_size1
,
src_elem_size2
,
1
>
{
enum
{
shift
=
4
};
};
template
<
size_t
src_elem_size1
,
size_t
src_elem_size2
>
struct
BinOpShift
<
src_elem_size1
,
src_elem_size2
,
2
>
{
enum
{
shift
=
2
};
};
...
...
modules/gpu/src/opencv2/gpu/device/limits.hpp
View file @
2777ebb8
...
...
@@ -46,7 +46,7 @@
#include <limits>
#include "common.hpp"
namespace
cv
{
namespace
gpu
{
namespace
device
namespace
cv
{
namespace
gpu
{
namespace
device
{
template
<
class
T
>
struct
numeric_limits
{
...
...
modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp
View file @
2777ebb8
...
...
@@ -57,35 +57,35 @@ namespace cv { namespace gpu { namespace device
template
<
typename
_Tp
>
__device__
__forceinline__
_Tp
saturate_cast
(
double
v
)
{
return
_Tp
(
v
);
}
template
<>
__device__
__forceinline__
uchar
saturate_cast
<
uchar
>
(
schar
v
)
{
return
(
uchar
)
::
max
((
int
)
v
,
0
);
{
return
(
uchar
)
::
max
((
int
)
v
,
0
);
}
template
<>
__device__
__forceinline__
uchar
saturate_cast
<
uchar
>
(
ushort
v
)
{
return
(
uchar
)
::
min
((
uint
)
v
,
(
uint
)
UCHAR_MAX
);
{
return
(
uchar
)
::
min
((
uint
)
v
,
(
uint
)
UCHAR_MAX
);
}
template
<>
__device__
__forceinline__
uchar
saturate_cast
<
uchar
>
(
int
v
)
{
return
(
uchar
)((
uint
)
v
<=
UCHAR_MAX
?
v
:
v
>
0
?
UCHAR_MAX
:
0
);
{
return
(
uchar
)((
uint
)
v
<=
UCHAR_MAX
?
v
:
v
>
0
?
UCHAR_MAX
:
0
);
}
template
<>
__device__
__forceinline__
uchar
saturate_cast
<
uchar
>
(
uint
v
)
{
return
(
uchar
)
::
min
(
v
,
(
uint
)
UCHAR_MAX
);
{
return
(
uchar
)
::
min
(
v
,
(
uint
)
UCHAR_MAX
);
}
template
<>
__device__
__forceinline__
uchar
saturate_cast
<
uchar
>
(
short
v
)
{
return
saturate_cast
<
uchar
>
((
uint
)
v
);
{
return
saturate_cast
<
uchar
>
((
uint
)
v
);
}
template
<>
__device__
__forceinline__
uchar
saturate_cast
<
uchar
>
(
float
v
)
{
int
iv
=
__float2int_rn
(
v
);
return
saturate_cast
<
uchar
>
(
iv
);
{
int
iv
=
__float2int_rn
(
v
);
return
saturate_cast
<
uchar
>
(
iv
);
}
template
<>
__device__
__forceinline__
uchar
saturate_cast
<
uchar
>
(
double
v
)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int
iv
=
__double2int_rn
(
v
);
int
iv
=
__double2int_rn
(
v
);
return
saturate_cast
<
uchar
>
(
iv
);
#else
return
saturate_cast
<
uchar
>
((
float
)
v
);
...
...
@@ -93,35 +93,35 @@ namespace cv { namespace gpu { namespace device
}
template
<>
__device__
__forceinline__
schar
saturate_cast
<
schar
>
(
uchar
v
)
{
return
(
schar
)
::
min
((
int
)
v
,
SCHAR_MAX
);
{
return
(
schar
)
::
min
((
int
)
v
,
SCHAR_MAX
);
}
template
<>
__device__
__forceinline__
schar
saturate_cast
<
schar
>
(
ushort
v
)
{
return
(
schar
)
::
min
((
uint
)
v
,
(
uint
)
SCHAR_MAX
);
{
return
(
schar
)
::
min
((
uint
)
v
,
(
uint
)
SCHAR_MAX
);
}
template
<>
__device__
__forceinline__
schar
saturate_cast
<
schar
>
(
int
v
)
{
return
(
schar
)((
uint
)(
v
-
SCHAR_MIN
)
<=
(
uint
)
UCHAR_MAX
?
v
:
v
>
0
?
SCHAR_MAX
:
SCHAR_MIN
);
}
template
<>
__device__
__forceinline__
schar
saturate_cast
<
schar
>
(
short
v
)
{
return
saturate_cast
<
schar
>
((
int
)
v
);
{
return
saturate_cast
<
schar
>
((
int
)
v
);
}
template
<>
__device__
__forceinline__
schar
saturate_cast
<
schar
>
(
uint
v
)
{
return
(
schar
)
::
min
(
v
,
(
uint
)
SCHAR_MAX
);
{
return
(
schar
)
::
min
(
v
,
(
uint
)
SCHAR_MAX
);
}
template
<>
__device__
__forceinline__
schar
saturate_cast
<
schar
>
(
float
v
)
{
int
iv
=
__float2int_rn
(
v
);
return
saturate_cast
<
schar
>
(
iv
);
{
int
iv
=
__float2int_rn
(
v
);
return
saturate_cast
<
schar
>
(
iv
);
}
template
<>
__device__
__forceinline__
schar
saturate_cast
<
schar
>
(
double
v
)
{
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int
iv
=
__double2int_rn
(
v
);
int
iv
=
__double2int_rn
(
v
);
return
saturate_cast
<
schar
>
(
iv
);
#else
return
saturate_cast
<
schar
>
((
float
)
v
);
...
...
@@ -129,30 +129,30 @@ namespace cv { namespace gpu { namespace device
}
template
<>
__device__
__forceinline__
ushort
saturate_cast
<
ushort
>
(
schar
v
)
{
return
(
ushort
)
::
max
((
int
)
v
,
0
);
{
return
(
ushort
)
::
max
((
int
)
v
,
0
);
}
template
<>
__device__
__forceinline__
ushort
saturate_cast
<
ushort
>
(
short
v
)
{
return
(
ushort
)
::
max
((
int
)
v
,
0
);
{
return
(
ushort
)
::
max
((
int
)
v
,
0
);
}
template
<>
__device__
__forceinline__
ushort
saturate_cast
<
ushort
>
(
int
v
)
{
return
(
ushort
)((
uint
)
v
<=
(
uint
)
USHRT_MAX
?
v
:
v
>
0
?
USHRT_MAX
:
0
);
{
return
(
ushort
)((
uint
)
v
<=
(
uint
)
USHRT_MAX
?
v
:
v
>
0
?
USHRT_MAX
:
0
);
}
template
<>
__device__
__forceinline__
ushort
saturate_cast
<
ushort
>
(
uint
v
)
{
return
(
ushort
)
::
min
(
v
,
(
uint
)
USHRT_MAX
);
{
return
(
ushort
)
::
min
(
v
,
(
uint
)
USHRT_MAX
);
}
template
<>
__device__
__forceinline__
ushort
saturate_cast
<
ushort
>
(
float
v
)
{
int
iv
=
__float2int_rn
(
v
);
return
saturate_cast
<
ushort
>
(
iv
);
int
iv
=
__float2int_rn
(
v
);
return
saturate_cast
<
ushort
>
(
iv
);
}
template
<>
__device__
__forceinline__
ushort
saturate_cast
<
ushort
>
(
double
v
)
{
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int
iv
=
__double2int_rn
(
v
);
int
iv
=
__double2int_rn
(
v
);
return
saturate_cast
<
ushort
>
(
iv
);
#else
return
saturate_cast
<
ushort
>
((
float
)
v
);
...
...
@@ -160,37 +160,37 @@ namespace cv { namespace gpu { namespace device
}
template
<>
__device__
__forceinline__
short
saturate_cast
<
short
>
(
ushort
v
)
{
return
(
short
)
::
min
((
int
)
v
,
SHRT_MAX
);
{
return
(
short
)
::
min
((
int
)
v
,
SHRT_MAX
);
}
template
<>
__device__
__forceinline__
short
saturate_cast
<
short
>
(
int
v
)
{
return
(
short
)((
uint
)(
v
-
SHRT_MIN
)
<=
(
uint
)
USHRT_MAX
?
v
:
v
>
0
?
SHRT_MAX
:
SHRT_MIN
);
}
template
<>
__device__
__forceinline__
short
saturate_cast
<
short
>
(
uint
v
)
{
return
(
short
)
::
min
(
v
,
(
uint
)
SHRT_MAX
);
{
return
(
short
)
::
min
(
v
,
(
uint
)
SHRT_MAX
);
}
template
<>
__device__
__forceinline__
short
saturate_cast
<
short
>
(
float
v
)
{
int
iv
=
__float2int_rn
(
v
);
return
saturate_cast
<
short
>
(
iv
);
{
int
iv
=
__float2int_rn
(
v
);
return
saturate_cast
<
short
>
(
iv
);
}
template
<>
__device__
__forceinline__
short
saturate_cast
<
short
>
(
double
v
)
{
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int
iv
=
__double2int_rn
(
v
);
int
iv
=
__double2int_rn
(
v
);
return
saturate_cast
<
short
>
(
iv
);
#else
return
saturate_cast
<
short
>
((
float
)
v
);
#endif
}
template
<>
__device__
__forceinline__
int
saturate_cast
<
int
>
(
float
v
)
{
return
__float2int_rn
(
v
);
template
<>
__device__
__forceinline__
int
saturate_cast
<
int
>
(
float
v
)
{
return
__float2int_rn
(
v
);
}
template
<>
__device__
__forceinline__
int
saturate_cast
<
int
>
(
double
v
)
template
<>
__device__
__forceinline__
int
saturate_cast
<
int
>
(
double
v
)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
return
__double2int_rn
(
v
);
...
...
@@ -200,11 +200,11 @@ namespace cv { namespace gpu { namespace device
}
template
<>
__device__
__forceinline__
uint
saturate_cast
<
uint
>
(
float
v
)
{
return
__float2uint_rn
(
v
);
{
return
__float2uint_rn
(
v
);
}
template
<>
__device__
__forceinline__
uint
saturate_cast
<
uint
>
(
double
v
)
{
template
<>
__device__
__forceinline__
uint
saturate_cast
<
uint
>
(
double
v
)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
return
__double2uint_rn
(
v
);
#else
...
...
modules/gpu/src/opencv2/gpu/device/scan.hpp
0 → 100644
View file @
2777ebb8
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_SCAN_HPP__
#define __OPENCV_GPU_SCAN_HPP__
// Selects the flavor of prefix scan: INCLUSIVE means element i covers
// inputs 0..i, EXCLUSIVE means element i covers inputs 0..i-1 (and
// element 0 yields the identity, 0).
enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };
// Intra-warp prefix scan over shared memory, Hillis-Steele style:
// each lane repeatedly folds in the element `stride` slots to its left,
// doubling the stride from 1 to 16. Assumes ptr[idx] is valid for the
// 32 lanes of the warp and that F is an associative binary functor.
template <ScanKind Kind, typename T, typename F>
struct WarpScan
{
    __device__ __forceinline__ WarpScan() {}
    __device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; }

    // Scans the warp's 32-element slice in place; returns this lane's
    // scan result (for EXCLUSIVE, lane 0 gets 0).
    __device__ __forceinline__ T operator()(volatile T* ptr, const unsigned int idx)
    {
        const unsigned int lane = idx & 31;
        F op;

        #pragma unroll
        for (unsigned int stride = 1; stride <= 16; stride <<= 1)
            if (lane >= stride)
                ptr[idx] = op(ptr[idx - stride], ptr[idx]);

        if (Kind == INCLUSIVE)
            return ptr[idx];
        return (lane > 0) ? ptr[idx - 1] : 0;
    }

    // Identity addressing: thread tid works on slot tid.
    __device__ __forceinline__ unsigned int index(const unsigned int tid)
    {
        return tid;
    }

    // No padding is used by this variant, so there is nothing to clear.
    __device__ __forceinline__ void init(volatile T* ptr) {}

    static const int warp_offset = 0;

    // Scan type used by BlockScan to combine the per-warp totals.
    typedef WarpScan<INCLUSIVE, T, F> merge;
};
// Intra-warp scan without the lane comparisons: every lane applies all
// five fold steps unconditionally. This only works because index() maps
// each warp onto a padded shared-memory stripe with 16 leading slots
// that init() zero-fills, so reads at idx - stride land on the identity
// instead of another warp's data.
template <ScanKind Kind, typename T, typename F>
struct WarpScanNoComp
{
    __device__ __forceinline__ WarpScanNoComp() {}
    __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; }

    __device__ __forceinline__ T operator()(volatile T* ptr, const unsigned int idx)
    {
        const unsigned int lane = threadIdx.x & 31;
        F op;

        #pragma unroll
        for (unsigned int stride = 1; stride <= 16; stride <<= 1)
            ptr[idx] = op(ptr[idx - stride], ptr[idx]);

        if (Kind == INCLUSIVE)
            return ptr[idx];
        return (lane > 0) ? ptr[idx - 1] : 0;
    }

    // Maps tid into its warp's padded stripe: warp base, then 16 slots
    // of zero padding, then the lane position.
    __device__ __forceinline__ unsigned int index(const unsigned int tid)
    {
        return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
    }

    // Clears the shared buffer so the padding slots read as identity.
    __device__ __forceinline__ void init(volatile T* ptr)
    {
        ptr[threadIdx.x] = 0;
    }

    static const int warp_smem_stride = 32 + 16 + 1; // lanes + padding + bank-conflict skew
    static const int warp_offset      = 16;
    static const int warp_log         = 5;
    static const int warp_mask        = 31;

    // Scan type used by BlockScan to combine the per-warp totals.
    typedef WarpScanNoComp<INCLUSIVE, T, F> merge;
};
// Block-wide prefix scan built from a warp-scan primitive Sc:
//   1. each warp scans its own slice of ptr;
//   2. lane 31 of every warp publishes the warp total;
//   3. warp 0 scans those totals (Sc::merge, always inclusive);
//   4. every thread folds its warp's running offset into its value.
// T is the element type, F the associative combining functor.
template <ScanKind Kind, typename T, typename Sc, typename F>
struct BlockScan
{
    __device__ __forceinline__ BlockScan() {}
    __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; }

    // Scans the block's data in place and returns this thread's result.
    // Must be called by every thread of the block (contains barriers).
    __device__ __forceinline__ T operator()(volatile T* ptr)
    {
        const unsigned int tid  = threadIdx.x;
        const unsigned int lane = tid &  warp_mask;
        const unsigned int warp = tid >> warp_log;

        Sc scan;
        typename Sc::merge merge_scan;
        F op;

        const unsigned int idx = scan.index(tid);

        T val = scan(ptr, idx);
        __syncthreads();

        if (warp == 0)
            scan.init(ptr);
        __syncthreads();

        // For EXCLUSIVE scans val is the exclusive result, but the warp
        // total needed for merging is the inclusive one still in ptr[idx].
        if (lane == 31)
            ptr[scan.warp_offset + warp] = (Kind == INCLUSIVE) ? val : ptr[idx];
        __syncthreads();

        if (warp == 0)
            merge_scan(ptr, idx);
        __syncthreads();

        // Fix: combine with the functor F instead of a hard-coded '+',
        // matching the warp scans — identical for additive F, correct
        // for any other associative operation.
        if (warp > 0)
            val = op(ptr[scan.warp_offset + warp - 1], val);
        __syncthreads();

        ptr[idx] = val;
        __syncthreads();

        return val;
    }

    static const int warp_log  = 5;
    static const int warp_mask = 31;
};
#endif
\ No newline at end of file
modules/gpu/src/opencv2/gpu/device/static_check.hpp
View file @
2777ebb8
...
...
@@ -43,27 +43,27 @@
#ifndef __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__
#define __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__
#if defined(__CUDACC__)
#define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
#if defined(__CUDACC__)
#define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
#else
#define __OPENCV_GPU_HOST_DEVICE__
#endif
#endif
namespace
cv
{
namespace
gpu
{
namespace
cv
{
namespace
gpu
{
namespace
device
{
// Compile-time assertion helper: Static<expr>::check() exists only in
// the true specialization, so Static<false>::check() fails to compile
// at the call site.
template <bool expr> struct Static {};

template <> struct Static<true>
{
    __OPENCV_GPU_HOST_DEVICE__ static void check() {};
};
}
}
using
::
cv
::
gpu
::
device
::
Static
;
}}
#undef __OPENCV_GPU_HOST_DEVICE__
#endif
/* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */
\ No newline at end of file
#endif
/* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */
\ No newline at end of file
modules/gpu/src/opencv2/gpu/device/transform.hpp
View file @
2777ebb8
...
...
@@ -47,7 +47,7 @@
#include "utility.hpp"
#include "detail/transform_detail.hpp"
namespace
cv
{
namespace
gpu
{
namespace
device
namespace
cv
{
namespace
gpu
{
namespace
device
{
template
<
typename
T
,
typename
D
,
typename
UnOp
,
typename
Mask
>
static
inline
void
transform
(
DevMem2D_
<
T
>
src
,
DevMem2D_
<
D
>
dst
,
UnOp
op
,
const
Mask
&
mask
,
cudaStream_t
stream
)
...
...
modules/gpu/src/opencv2/gpu/device/type_traits.hpp
View file @
2777ebb8
...
...
@@ -45,11 +45,11 @@
#include "detail/type_traits_detail.hpp"
namespace
cv
{
namespace
gpu
{
namespace
device
namespace
cv
{
namespace
gpu
{
namespace
device
{
template
<
typename
T
>
struct
IsSimpleParameter
{
enum
{
value
=
type_traits_detail
::
IsIntegral
<
T
>::
value
||
type_traits_detail
::
IsFloat
<
T
>::
value
||
enum
{
value
=
type_traits_detail
::
IsIntegral
<
T
>::
value
||
type_traits_detail
::
IsFloat
<
T
>::
value
||
type_traits_detail
::
PointerTraits
<
typename
type_traits_detail
::
ReferenceTraits
<
T
>::
type
>::
value
};
};
...
...
@@ -65,16 +65,16 @@ namespace cv { namespace gpu { namespace device
enum
{
isVolatile
=
type_traits_detail
::
UnVolatile
<
T
>::
value
};
enum
{
isReference
=
type_traits_detail
::
ReferenceTraits
<
UnqualifiedType
>::
value
};
enum
{
isPointer
=
type_traits_detail
::
PointerTraits
<
typename
type_traits_detail
::
ReferenceTraits
<
UnqualifiedType
>::
type
>::
value
};
enum
{
isPointer
=
type_traits_detail
::
PointerTraits
<
typename
type_traits_detail
::
ReferenceTraits
<
UnqualifiedType
>::
type
>::
value
};
enum
{
isUnsignedInt
=
type_traits_detail
::
IsUnsignedIntegral
<
UnqualifiedType
>::
value
};
enum
{
isSignedInt
=
type_traits_detail
::
IsSignedIntergral
<
UnqualifiedType
>::
value
};
enum
{
isIntegral
=
type_traits_detail
::
IsIntegral
<
UnqualifiedType
>::
value
};
enum
{
isFloat
=
type_traits_detail
::
IsFloat
<
UnqualifiedType
>::
value
};
enum
{
isArith
=
isIntegral
||
isFloat
};
enum
{
isVec
=
type_traits_detail
::
IsVec
<
UnqualifiedType
>::
value
};
typedef
typename
type_traits_detail
::
Select
<
IsSimpleParameter
<
UnqualifiedType
>::
value
,
enum
{
isUnsignedInt
=
type_traits_detail
::
IsUnsignedIntegral
<
UnqualifiedType
>::
value
};
enum
{
isSignedInt
=
type_traits_detail
::
IsSignedIntergral
<
UnqualifiedType
>::
value
};
enum
{
isIntegral
=
type_traits_detail
::
IsIntegral
<
UnqualifiedType
>::
value
};
enum
{
isFloat
=
type_traits_detail
::
IsFloat
<
UnqualifiedType
>::
value
};
enum
{
isArith
=
isIntegral
||
isFloat
};
enum
{
isVec
=
type_traits_detail
::
IsVec
<
UnqualifiedType
>::
value
};
typedef
typename
type_traits_detail
::
Select
<
IsSimpleParameter
<
UnqualifiedType
>::
value
,
T
,
typename
type_traits_detail
::
AddParameterType
<
T
>::
type
>::
type
ParameterType
;
};
}}}
...
...
modules/gpu/src/opencv2/gpu/device/utility.hpp
View file @
2777ebb8
...
...
@@ -47,17 +47,17 @@
#include "datamov_utils.hpp"
#include "detail/utility_detail.hpp"
namespace
cv
{
namespace
gpu
{
namespace
device
namespace
cv
{
namespace
gpu
{
namespace
device
{
#define OPENCV_GPU_LOG_WARP_SIZE
(5)
#define OPENCV_GPU_WARP_SIZE
(1 << OPENCV_GPU_LOG_WARP_SIZE)
#define OPENCV_GPU_LOG_WARP_SIZE
(5)
#define OPENCV_GPU_WARP_SIZE
(1 << OPENCV_GPU_LOG_WARP_SIZE)
#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
#define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS)
///////////////////////////////////////////////////////////////////////////////
// swap
template
<
typename
T
>
void
__device__
__host__
__forceinline__
swap
(
T
&
a
,
T
&
b
)
template
<
typename
T
>
void
__device__
__host__
__forceinline__
swap
(
T
&
a
,
T
&
b
)
{
const
T
temp
=
a
;
a
=
b
;
...
...
@@ -71,9 +71,9 @@ namespace cv { namespace gpu { namespace device
{
explicit
__host__
__device__
__forceinline__
SingleMask
(
PtrStepb
mask_
)
:
mask
(
mask_
)
{}
__host__
__device__
__forceinline__
SingleMask
(
const
SingleMask
&
mask_
)
:
mask
(
mask_
.
mask
){}
__device__
__forceinline__
bool
operator
()(
int
y
,
int
x
)
const
{
{
return
mask
.
ptr
(
y
)[
x
]
!=
0
;
}
...
...
@@ -82,13 +82,13 @@ namespace cv { namespace gpu { namespace device
struct
SingleMaskChannels
{
__host__
__device__
__forceinline__
SingleMaskChannels
(
PtrStepb
mask_
,
int
channels_
)
__host__
__device__
__forceinline__
SingleMaskChannels
(
PtrStepb
mask_
,
int
channels_
)
:
mask
(
mask_
),
channels
(
channels_
)
{}
__host__
__device__
__forceinline__
SingleMaskChannels
(
const
SingleMaskChannels
&
mask_
)
:
mask
(
mask_
.
mask
),
channels
(
mask_
.
channels
){}
__device__
__forceinline__
bool
operator
()(
int
y
,
int
x
)
const
{
{
return
mask
.
ptr
(
y
)[
x
/
channels
]
!=
0
;
}
...
...
@@ -112,7 +112,7 @@ namespace cv { namespace gpu { namespace device
{
curMask
=
maskCollection
[
z
];
}
__device__
__forceinline__
bool
operator
()(
int
y
,
int
x
)
const
{
uchar
val
;
...
...
@@ -165,20 +165,20 @@ namespace cv { namespace gpu { namespace device
utility_detail
::
ReductionDispatcher
<
n
<=
64
>::
reduce
<
n
>
(
data
,
partial_reduction
,
tid
,
op
);
}
template
<
int
n
,
typename
T
,
typename
V
,
typename
Pred
>
template
<
int
n
,
typename
T
,
typename
V
,
typename
Pred
>
__device__
__forceinline__
void
reducePredVal
(
volatile
T
*
sdata
,
T
&
myData
,
V
*
sval
,
V
&
myVal
,
int
tid
,
const
Pred
&
pred
)
{
StaticAssert
<
n
>=
8
&&
n
<=
512
>::
check
();
utility_detail
::
PredValReductionDispatcher
<
n
<=
64
>::
reduce
<
n
>
(
myData
,
myVal
,
sdata
,
sval
,
tid
,
pred
);
}
template
<
int
n
,
typename
T
,
typename
V1
,
typename
V2
,
typename
Pred
>
template
<
int
n
,
typename
T
,
typename
V1
,
typename
V2
,
typename
Pred
>
__device__
__forceinline__
void
reducePredVal2
(
volatile
T
*
sdata
,
T
&
myData
,
V1
*
sval1
,
V1
&
myVal1
,
V2
*
sval2
,
V2
&
myVal2
,
int
tid
,
const
Pred
&
pred
)
{
StaticAssert
<
n
>=
8
&&
n
<=
512
>::
check
();
utility_detail
::
PredVal2ReductionDispatcher
<
n
<=
64
>::
reduce
<
n
>
(
myData
,
myVal1
,
myVal2
,
sdata
,
sval1
,
sval2
,
tid
,
pred
);
}
///////////////////////////////////////////////////////////////////////////////
// Solve linear system
...
...
@@ -212,17 +212,17 @@ namespace cv { namespace gpu { namespace device
{
double
invdet
=
1.0
/
det
;
x
[
0
]
=
saturate_cast
<
T
>
(
invdet
*
x
[
0
]
=
saturate_cast
<
T
>
(
invdet
*
(
b
[
0
]
*
(
A
[
1
][
1
]
*
A
[
2
][
2
]
-
A
[
1
][
2
]
*
A
[
2
][
1
])
-
A
[
0
][
1
]
*
(
b
[
1
]
*
A
[
2
][
2
]
-
A
[
1
][
2
]
*
b
[
2
]
)
+
A
[
0
][
2
]
*
(
b
[
1
]
*
A
[
2
][
1
]
-
A
[
1
][
1
]
*
b
[
2
]
)));
x
[
1
]
=
saturate_cast
<
T
>
(
invdet
*
x
[
1
]
=
saturate_cast
<
T
>
(
invdet
*
(
A
[
0
][
0
]
*
(
b
[
1
]
*
A
[
2
][
2
]
-
A
[
1
][
2
]
*
b
[
2
]
)
-
b
[
0
]
*
(
A
[
1
][
0
]
*
A
[
2
][
2
]
-
A
[
1
][
2
]
*
A
[
2
][
0
])
+
A
[
0
][
2
]
*
(
A
[
1
][
0
]
*
b
[
2
]
-
b
[
1
]
*
A
[
2
][
0
])));
x
[
2
]
=
saturate_cast
<
T
>
(
invdet
*
x
[
2
]
=
saturate_cast
<
T
>
(
invdet
*
(
A
[
0
][
0
]
*
(
A
[
1
][
1
]
*
b
[
2
]
-
b
[
1
]
*
A
[
2
][
1
])
-
A
[
0
][
1
]
*
(
A
[
1
][
0
]
*
b
[
2
]
-
b
[
1
]
*
A
[
2
][
0
])
+
b
[
0
]
*
(
A
[
1
][
0
]
*
A
[
2
][
1
]
-
A
[
1
][
1
]
*
A
[
2
][
0
])));
...
...
modules/gpu/src/opencv2/gpu/device/vec_distance.hpp
View file @
2777ebb8
...
...
@@ -47,7 +47,7 @@
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"
namespace
cv
{
namespace
gpu
{
namespace
device
namespace
cv
{
namespace
gpu
{
namespace
device
{
template
<
typename
T
>
struct
L1Dist
{
...
...
@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device
};
// calc distance between two vectors in global memory
template
<
int
THREAD_DIM
,
typename
Dist
,
typename
T1
,
typename
T2
>
template
<
int
THREAD_DIM
,
typename
Dist
,
typename
T1
,
typename
T2
>
__device__
void
calcVecDiffGlobal
(
const
T1
*
vec1
,
const
T2
*
vec2
,
int
len
,
Dist
&
dist
,
typename
Dist
::
result_type
*
smem
,
int
tid
)
{
for
(
int
i
=
tid
;
i
<
len
;
i
+=
THREAD_DIM
)
...
...
@@ -170,9 +170,9 @@ namespace cv { namespace gpu { namespace device
// calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
template
<
int
THREAD_DIM
,
int
MAX_LEN
,
bool
LEN_EQ_MAX_LEN
,
typename
Dist
,
typename
T1
,
typename
T2
>
__device__
__forceinline__
void
calcVecDiffCached
(
const
T1
*
vecCached
,
const
T2
*
vecGlob
,
int
len
,
Dist
&
dist
,
typename
Dist
::
result_type
*
smem
,
int
tid
)
{
{
vec_distance_detail
::
VecDiffCachedCalculator
<
THREAD_DIM
,
MAX_LEN
,
LEN_EQ_MAX_LEN
>::
calc
(
vecCached
,
vecGlob
,
len
,
dist
,
tid
);
dist
.
reduceAll
<
THREAD_DIM
>
(
smem
,
tid
);
}
...
...
modules/gpu/src/opencv2/gpu/device/vec_math.hpp
View file @
2777ebb8
...
...
@@ -47,7 +47,7 @@
#include "vec_traits.hpp"
#include "functional.hpp"
namespace
cv
{
namespace
gpu
{
namespace
device
namespace
cv
{
namespace
gpu
{
namespace
device
{
namespace
vec_math_detail
{
...
...
@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device
}
namespace
vec_math_detail
{
{
template
<
typename
T1
,
typename
T2
>
struct
BinOpTraits
{
typedef
int
argument_type
;
...
...
@@ -326,5 +326,5 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_IMPLEMENT_VEC_OP
#undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP
}}}
// namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_VECMATH_HPP__
\ No newline at end of file
modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
View file @
2777ebb8
...
...
@@ -45,7 +45,7 @@
#include "common.hpp"
namespace
cv
{
namespace
gpu
{
namespace
device
namespace
cv
{
namespace
gpu
{
namespace
device
{
template
<
typename
T
,
int
N
>
struct
TypeVec
;
...
...
@@ -219,18 +219,18 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS
template
<>
struct
VecTraits
<
char
>
{
template
<>
struct
VecTraits
<
char
>
{
typedef
char
elem_type
;
enum
{
cn
=
1
};
enum
{
cn
=
1
};
static
__device__
__host__
__forceinline__
char
all
(
char
v
)
{
return
v
;}
static
__device__
__host__
__forceinline__
char
make
(
char
x
)
{
return
x
;}
static
__device__
__host__
__forceinline__
char
make
(
const
char
*
x
)
{
return
*
x
;}
};
template
<>
struct
VecTraits
<
schar
>
{
template
<>
struct
VecTraits
<
schar
>
{
typedef
schar
elem_type
;
enum
{
cn
=
1
};
enum
{
cn
=
1
};
static
__device__
__host__
__forceinline__
schar
all
(
schar
v
)
{
return
v
;}
static
__device__
__host__
__forceinline__
schar
make
(
schar
x
)
{
return
x
;}
static
__device__
__host__
__forceinline__
schar
make
(
const
schar
*
x
)
{
return
*
x
;}
...
...
modules/gpu/src/opencv2/gpu/device/warp.hpp
View file @
2777ebb8
...
...
@@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_DEVICE_WARP_HPP__
#define __OPENCV_GPU_DEVICE_WARP_HPP__
namespace
cv
{
namespace
gpu
{
namespace
device
namespace
cv
{
namespace
gpu
{
namespace
device
{
struct
Warp
{
...
...
@@ -64,18 +64,18 @@ namespace cv { namespace gpu { namespace device
// Cooperative warp fill: each lane starts at its own slot and strides
// by the warp width, so the 32 lanes together cover [beg, end).
template<typename It, typename T>
static __device__ __forceinline__ void fill(It beg, It end, const T& value)
{
    for (It cur = beg + laneId(); cur < end; cur += STRIDE)
        *cur = value;
}
// Cooperative warp copy of [beg, end) to out; returns the advanced
// output iterator (per-lane position, as in the original contract).
template<typename InIt, typename OutIt>
static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
{
    for (InIt src = beg + laneId(); src < end; src += STRIDE, out += STRIDE)
        *out = *src;
    return out;
}
template
<
typename
InIt
,
typename
OutIt
,
class
UnOp
>
static
__device__
__forceinline__
OutIt
transform
(
InIt
beg
,
InIt
end
,
OutIt
out
,
UnOp
op
)
...
...
@@ -90,7 +90,7 @@ namespace cv { namespace gpu { namespace device
{
unsigned
int
lane
=
laneId
();
InIt1
t1
=
beg1
+
lane
;
InIt1
t1
=
beg1
+
lane
;
InIt2
t2
=
beg2
+
lane
;
for
(;
t1
<
end1
;
t1
+=
STRIDE
,
t2
+=
STRIDE
,
out
+=
STRIDE
)
*
out
=
op
(
*
t1
,
*
t2
);
...
...
@@ -100,7 +100,7 @@ namespace cv { namespace gpu { namespace device
template
<
typename
OutIt
,
typename
T
>
static
__device__
__forceinline__
void
yota
(
OutIt
beg
,
OutIt
end
,
T
value
)
{
unsigned
int
lane
=
laneId
();
unsigned
int
lane
=
laneId
();
value
+=
lane
;
for
(
OutIt
t
=
beg
+
lane
;
t
<
end
;
t
+=
STRIDE
,
value
+=
STRIDE
)
...
...
modules/gpu/src/resize.cpp
View file @
2777ebb8
...
...
@@ -44,7 +44,32 @@
#ifndef HAVE_CUDA
// CPU-only build stub: resize is unavailable without CUDA, so every
// argument is deliberately ignored and throw_nogpu() reports the error.
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
    (void) src;
    (void) dst;
    (void) dsize;
    (void) fx;
    (void) fy;
    (void) interpolation;
    (void) s;
    throw_nogpu();
}
// CPU-only build stub for the buffered overload: arguments are ignored
// and throw_nogpu() reports that CUDA support is missing.
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, GpuMat& buffer, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
    (void) src;
    (void) dst;
    (void) buffer;
    (void) dsize;
    (void) fx;
    (void) fy;
    (void) interpolation;
    (void) s;
    throw_nogpu();
}
#else // HAVE_CUDA
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment