opencv · Commit e7f6c4b7
Authored Jun 20, 2012 by Marina Kolpakova
scan operations are moved into a separate header
Parent: 8748cbc2
Showing 28 changed files with 465 additions and 721 deletions (+465 / -721):
cmake/OpenCVDetectCUDA.cmake                            +5    -5
modules/gpu/include/opencv2/gpu/gpu.hpp                 +0    -4
modules/gpu/perf/perf_imgproc.cpp                       +3    -3
modules/gpu/src/cuda/element_operations.cu              +4    -4
modules/gpu/src/cuda/matrix_reductions.cu               +1    -1
modules/gpu/src/cuda/resize.cu                          +2    -363
modules/gpu/src/cuda/split_merge.cu                     +18   -18
modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu            +68   -68
modules/gpu/src/nvidia/core/NCV.cu                      +8    -8
modules/gpu/src/opencv2/gpu/device/common.hpp           +7    -7
modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp    +10   -10
modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp     +1    -1
modules/gpu/src/opencv2/gpu/device/emulation.hpp        +9    -9
modules/gpu/src/opencv2/gpu/device/funcattrib.hpp       +4    -4
modules/gpu/src/opencv2/gpu/device/functional.hpp       +39   -17
modules/gpu/src/opencv2/gpu/device/limits.hpp           +1    -1
modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp    +56   -56
modules/gpu/src/opencv2/gpu/device/scan.hpp             +167  -0
modules/gpu/src/opencv2/gpu/device/static_check.hpp     +12   -12
modules/gpu/src/opencv2/gpu/device/transform.hpp        +1    -1
modules/gpu/src/opencv2/gpu/device/type_traits.hpp      +11   -11
modules/gpu/src/opencv2/gpu/device/utility.hpp          +16   -16
modules/gpu/src/opencv2/gpu/device/vec_distance.hpp     +4    -4
modules/gpu/src/opencv2/gpu/device/vec_math.hpp         +4    -3
modules/gpu/src/opencv2/gpu/device/vec_traits.hpp       +7    -7
modules/gpu/src/opencv2/gpu/device/warp.hpp             +7    -7
modules/gpu/src/resize.cpp                              +0    -42
modules/gpu/test/test_resize.cpp                        +0    -39
cmake/OpenCVDetectCUDA.cmake
if(${CMAKE_VERSION} VERSION_LESS "2.8.3")
    message(STATUS "WITH_CUDA flag requires CMake 2.8.3. CUDA support is disabled.")
    return()
endif()

find_package(CUDA 4.1)

if(CUDA_FOUND)
...
@@ -23,7 +23,7 @@ if(CUDA_FOUND)
    else()
        set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
    endif()

    set(CUDA_ARCH_PTX "2.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")

    string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}")
...
@@ -89,8 +89,8 @@ if(CUDA_FOUND)
        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
    endif()

    # we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
    set(CMAKE_CXX_FLAGS_DEBUG_ ${CMAKE_CXX_FLAGS_DEBUG})
    string(REPLACE "-ggdb3" "" CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})

    CUDA_COMPILE(${VAR} ${ARGN})

    set(CMAKE_CXX_DEBUG_FLAGS ${CMAKE_CXX_FLAGS_DEBUG_})
...
modules/gpu/include/opencv2/gpu/gpu.hpp
...
@@ -629,10 +629,6 @@ CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, doubl
 //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA
 CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx = 0, double fy = 0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());

-//! resizes the image
-//! Supports INTER_AREA
-CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, GpuMat& buffer, Size dsize,
-    double fx = 0, double fy = 0, int interpolation = INTER_AREA, Stream& stream = Stream::Null());

 //! warps the image using affine transformation
 //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
 CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR,
...
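For orientation: with this hunk shrinking from 10 lines to 6, the four removed lines appear to be the INTER_AREA overload that took an explicit GpuMat buffer (its host wrapper in resize.cpp and its tests in test_resize.cpp are also deleted by this commit). A minimal host-side sketch of calling the surviving overload, assuming an OpenCV 2.4-era build with the gpu module; the image path and scale are hypothetical:

#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::Mat host = cv::imread("input.png");     // hypothetical input image
    cv::gpu::GpuMat src(host), dst;             // upload to the device

    // Downscale 4x per axis; INTER_AREA averages each 4x4 source block.
    cv::gpu::resize(src, dst, cv::Size(), 0.25, 0.25, cv::INTER_AREA);

    cv::Mat result(dst);                        // download the result
    return 0;
}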
modules/gpu/perf/perf_imgproc.cpp
...
@@ -118,10 +118,10 @@ GPU_PERF_TEST(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, Scale)
 INSTANTIATE_TEST_CASE_P(ImgProc, ResizeArea, testing::Combine(
     ALL_DEVICES,
-    testing::Values(perf::sz1080p, cv::Size(4096, 2048)),
-    testing::Values(MatType(CV_8UC1)/*, MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/),
+    testing::Values(perf::sz1080p/*, cv::Size(4096, 2048)*/),
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
+                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
+                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
     testing::Values(Scale(0.2), Scale(0.1), Scale(0.05))));

//////////////////////////////////////////////////////////////////////
...
modules/gpu/src/cuda/element_operations.cu
...
@@ -1253,7 +1253,7 @@ namespace cv { namespace gpu { namespace device
 {
     const T val;

-    __host__ explicit CompareScalar(T val) : val(val) {}
+    __host__ explicit CompareScalar(T val_) : val(val_) {}

     __device__ __forceinline__ uchar operator()(T src) const
     {
...
@@ -1266,7 +1266,7 @@ namespace cv { namespace gpu { namespace device
 {
     const TYPE_VEC(T, 2) val;

-    __host__ explicit CompareScalar(TYPE_VEC(T, 2) val) : val(val) {}
+    __host__ explicit CompareScalar(TYPE_VEC(T, 2) val_) : val(val_) {}

     __device__ __forceinline__ TYPE_VEC(uchar, 2) operator()(const TYPE_VEC(T, 2) & src) const
     {
...
@@ -1281,7 +1281,7 @@ namespace cv { namespace gpu { namespace device
 {
     const TYPE_VEC(T, 3) val;

-    __host__ explicit CompareScalar(TYPE_VEC(T, 3) val) : val(val) {}
+    __host__ explicit CompareScalar(TYPE_VEC(T, 3) val_) : val(val_) {}

     __device__ __forceinline__ TYPE_VEC(uchar, 3) operator()(const TYPE_VEC(T, 3) & src) const
     {
...
@@ -1297,7 +1297,7 @@ namespace cv { namespace gpu { namespace device
 {
     const TYPE_VEC(T, 4) val;

-    __host__ explicit CompareScalar(TYPE_VEC(T, 4) val) : val(val) {}
+    __host__ explicit CompareScalar(TYPE_VEC(T, 4) val_) : val(val_) {}

     __device__ __forceinline__ TYPE_VEC(uchar, 4) operator()(const TYPE_VEC(T, 4) & src) const
     {
...
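All four hunks make the same fix: the constructor parameter is renamed val to val_ so it no longer shares a name with the val member. The old val(val) initializer is legal C++ (the outer name resolves to the member, the inner to the parameter), but the parameter shadows the member inside the constructor body and trips -Wshadow. An illustrative reduction, not from the commit:

struct CompareScalarOld
{
    const int val;
    explicit CompareScalarOld(int val) : val(val) {}   // legal, but plain 'val' in
                                                       // the body means the parameter
};

struct CompareScalarNew
{
    const int val;
    explicit CompareScalarNew(int val_) : val(val_) {} // no shadowing, no warning
};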
modules/gpu/src/cuda/matrix_reductions.cu
...
@@ -72,7 +72,7 @@ namespace cv { namespace gpu { namespace device
 struct Mask8U
 {
-    explicit Mask8U(PtrStepb mask): mask(mask) {}
+    explicit Mask8U(PtrStepb mask_): mask(mask_) {}

     __device__ __forceinline__ bool operator()(int y, int x) const
     {
...
modules/gpu/src/cuda/resize.cu
...
@@ -46,7 +46,8 @@
 #include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/filters.hpp"
-# include <cfloat>
+#include <cfloat>
+#include <opencv2/gpu/device/scan.hpp>

 namespace cv { namespace gpu { namespace device
 {
...
@@ -285,367 +286,5 @@ namespace cv { namespace gpu { namespace device
typedef float scan_line_type;
};
// template <typename T>
// __global__ void resize_area_scan(const DevMem2D_<T> src, DevMem2D_<T> dst, int fx, int fy, DevMem2D_<T> buffer)
// {
// typedef typename scan_traits<T>::scan_line_type W;
// extern __shared__ W line[];
// const int x = threadIdx.x;
// const int y = blockIdx.x;
// if (y >= src.rows) return;
// int offset = 1;
// line[2 * x + 0] = src(y, 2 * x + 0);
// line[2 * x + 1] = src(y, 2 * x + 1);
// __syncthreads();//???
// // reduction
// for (int d = blockDim.x; d > 0; d >>= 1)
// {
// __syncthreads();
// if (x < d)
// {
// int ai = 2 * x * offset -1 + 1 * offset;
// int bi = 2 * x * offset -1 + 2 * offset;
// line[bi] += line[ai];
// }
// offset *= 2;
// }
// __syncthreads();
// // convolution
// if (x == 0) { line[(blockDim.x << 1) - 1] = 0; printf("offset: %d!!!!!!!!!!!!!\n", fx);}
// for (int d = 1; d < (blockDim.x << 1); d *= 2)
// {
// offset >>= 1;
// __syncthreads();
// if (x < d)
// {
// int ai = offset * 2 * x + 1 * offset - 1;
// int bi = offset * 2 * x + 2 * offset - 1;
// W t = line[ai];
// line[ai] = line[bi];
// line[bi] += t;
// }
// }
// __syncthreads();
// // calculate sum
// int start = 0;
// int out_idx = 0;
// int end = start + fx;
// while (start < (blockDim.x << 1) && end < (blockDim.x << 1))
// {
// buffer(y, out_idx) = saturate_cast<T>((line[end] - line[start]) / fx);
// start = end;
// end = start + fx;
// out_idx++;
// }
// }
template <typename T>
__device__ void scan_y(DevMem2D_<typename scan_traits<T>::scan_line_type> buffer,int fx, int fy, DevMem2D_<T> dst,
typename scan_traits<T>::scan_line_type* line, int g_base)
{
typedef typename scan_traits<T>::scan_line_type W;
const int y = threadIdx.x;
const int x = blockIdx.x;
float scale = 1.f / (fx * fy);
if (x >= buffer.cols) return;
int offset = 1;
line[2 * y + 0] = buffer((g_base * fy) + 2 * y + 1, x);
if (y != (blockDim.x -1) )
line[2 * y + 1] = buffer((g_base * fy) + 2 * y + 2, x);
else
line[2 * y + 1] = 0;
__syncthreads();
// reduction
for (int d = blockDim.x; d > 0; d >>= 1)
{
__syncthreads();
if (y < d)
{
int ai = 2 * y * offset -1 + 1 * offset;
int bi = 2 * y * offset -1 + 2 * offset;
line[bi] += line[ai];
}
offset *= 2;
}
__syncthreads();
// convolution
if (y == 0) line[(blockDim.x << 1) - 1] = (W)buffer(0, x);
for (int d = 1; d < (blockDim.x << 1); d *= 2)
{
offset >>= 1;
__syncthreads();
if (y < d)
{
int ai = offset * 2 * y + 1 * offset - 1;
int bi = offset * 2 * y + 2 * offset - 1;
W t = line[ai];
line[ai] = line[bi];
line[bi] += t;
}
}
__syncthreads();
if (y < dst.rows)
{
W start = (y == 0)? (W)0:line[y * fy -1];
W end = line[y * fy + fy - 1];
dst(g_base + y ,x) = saturate_cast<T>((end - start) * scale);
}
}
template <typename T>
__device__ void scan_x(const DevMem2D_<T> src, int fx, int fy, DevMem2D_<typename scan_traits<T>::scan_line_type> buffer,
typename scan_traits<T>::scan_line_type* line, int g_base)
{
typedef typename scan_traits<T>::scan_line_type W;
const int x = threadIdx.x;
const int y = blockIdx.x;
float scale = 1.f / (fx * fy);
if (y >= src.rows) return;
int offset = 1;
line[2 * x + 0] = (W)src(y, (g_base * fx) + 2 * x + 1);
if (x != (blockDim.x -1) )
line[2 * x + 1] = (W)src(y, (g_base * fx) + 2 * x + 2);
else
line[2 * x + 1] = 0;
__syncthreads();
// reduction
for (int d = blockDim.x; d > 0; d >>= 1)
{
__syncthreads();
if (x < d)
{
int ai = 2 * x * offset -1 + 1 * offset;
int bi = 2 * x * offset -1 + 2 * offset;
line[bi] += line[ai];
}
offset *= 2;
}
__syncthreads();
// convolution
if (x == 0) line[(blockDim.x << 1) - 1] = (W)src(y, 0);
for (int d = 1; d < (blockDim.x << 1); d *= 2)
{
offset >>= 1;
__syncthreads();
if (x < d)
{
int ai = offset * 2 * x + 1 * offset - 1;
int bi = offset * 2 * x + 2 * offset - 1;
W t = line[ai];
line[ai] = line[bi];
line[bi] += t;
}
}
__syncthreads();
if (x < buffer.cols)
{
W start = (x == 0)? (W)0:line[x * fx -1];
W end = line[x * fx + fx - 1];
buffer(y, g_base + x) =(end - start);
}
}
enum ScanKind { exclusive, inclusive } ;
template <ScanKind Kind , class T>
__device__ __forceinline__ T scan_warp ( volatile T *ptr , const unsigned int idx = threadIdx.x )
{
const unsigned int lane = idx & 31;
if ( lane >= 1) ptr [idx ] = ptr [idx - 1] + ptr [idx];
if ( lane >= 2) ptr [idx ] = ptr [idx - 2] + ptr [idx];
if ( lane >= 4) ptr [idx ] = ptr [idx - 4] + ptr [idx];
if ( lane >= 8) ptr [idx ] = ptr [idx - 8] + ptr [idx];
if ( lane >= 16) ptr [idx ] = ptr [idx - 16] + ptr [idx];
if( Kind == inclusive )
return ptr [idx ];
else
return (lane > 0) ? ptr [idx - 1] : 0;
}
template <ScanKind Kind , class T>
__device__ __forceinline__ T scan_block( volatile T *ptr)
{
const unsigned int idx = threadIdx.x;
const unsigned int lane = idx & 31;
const unsigned int warp = idx >> 5;
T val = scan_warp <Kind>( ptr , idx );
__syncthreads ();
if( lane == 31 )
ptr [ warp ] = ptr [idx ];
__syncthreads ();
if( warp == 0 )
scan_warp<inclusive>( ptr , idx );
__syncthreads ();
if ( warp > 0)
val = ptr [warp -1] + val;
__syncthreads ();
ptr[idx] = val;
__syncthreads ();
return val ;
}
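// Editor's note (not part of the diff): scan_warp is a Hillis-Steele inclusive
// scan over one 32-thread warp. Step k adds the value 2^k lanes to the left, so
// after log2(32) = 5 steps each lane holds the sum of all lanes up to itself.
// A 4-lane trace with '+': [3 1 4 1] -> step 1 -> [3 4 5 5] -> step 2 -> [3 4 8 9],
// the inclusive prefix sum of the input. No __syncthreads() appears between
// steps because the code relies on warp-synchronous execution, which was safe
// on the hardware of the time. scan_block extends this across the block: lane
// 31 of each warp deposits its warp total, warp 0 scans those totals, and each
// warp then adds its prefix back onto its own elements.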
template<typename T, typename W>
__global__ void resise_scan_fast_x(const DevMem2D_<T> src, DevMem2D_<W> dst, int fx, int fy, int thred_lines, int stride)
{
extern __shared__ W sbuf[];
const unsigned int tid = threadIdx.x;
// load line-block on shared memory
int y = blockIdx.x / thred_lines;
int input_stride = (blockIdx.x % thred_lines) * stride;
int x = input_stride + tid;
// store global data in shared memory
if (x < src.cols && y < src.rows)
sbuf[tid] = src(y, x);
else
sbuf[tid] = 0;
__syncthreads();
scan_block<inclusive, W>(sbuf);
float scale = __fdividef(1.f, fx);
int out_stride = input_stride / fx;
int count = blockDim.x / fx;
if (tid < count)
{
int start_idx = (tid == 0)? 0 : tid * fx - 1;
int end_idx = tid * fx + fx - 1;
W start = (tid == 0)? (W)0:sbuf[start_idx];
W end = sbuf[end_idx];
dst(y, out_stride + tid) = (end - start);
}
}
template<typename T, typename W>
__global__ void resise_scan_fast_y(const DevMem2D_<W> src, DevMem2D_<T> dst, int fx, int fy, int thred_lines, int stride)
{
extern __shared__ W sbuf[];
const unsigned int tid = threadIdx.x;
// load line-block on shared memory
int x = blockIdx.x / thred_lines;
int global_stride = (blockIdx.x % thred_lines) * stride;
int y = global_stride + tid;
// store global data in shared memory
if (x < src.cols && y < src.rows)
sbuf[tid] = src(y, x);
else
sbuf[tid] = 0;
__syncthreads();
scan_block<inclusive, W>(sbuf);
float scale = __fdividef(1.f, fx * fy);
int out_stride = global_stride / fx;
int count = blockDim.x / fx;
if (tid < count)
{
int start_idx = (tid == 0)? 0 : tid * fx - 1;
int end_idx = tid * fx + fx - 1;
W start = (tid == 0)? (W)0:sbuf[start_idx];
W end = sbuf[end_idx];
dst(out_stride + tid, x) = saturate_cast<T>((end - start) * scale);
}
}
template <typename T>
void resize_area_gpu(const DevMem2Db src, DevMem2Db dst,float fx, float fy,
int interpolation, DevMem2Df buffer, cudaStream_t stream)
{
(void)interpolation;
int iscale_x = round(fx);
int iscale_y = round(fy);
int warps = 4;
const int threads = 32 * warps;
int input_stride = threads / iscale_x;
int thred_lines = divUp(src.cols, input_stride * iscale_x);
int blocks = src.rows * thred_lines;
typedef typename scan_traits<T>::scan_line_type smem_type;
resise_scan_fast_x<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>
(src, buffer, iscale_x, iscale_y, thred_lines, input_stride * iscale_x);
input_stride = threads / iscale_y;
thred_lines = divUp(src.rows, input_stride * iscale_y);
blocks = dst.cols * thred_lines;
resise_scan_fast_y<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>
(buffer, dst, iscale_x, iscale_y, thred_lines, input_stride * iscale_y);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void resize_area_gpu<uchar>(DevMem2Db src, DevMem2Db dst, float fx, float fy, int interpolation, DevMem2Df buffer, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
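The hunk above (367 lines down to 5) removes the experimental scan-based area resize from resize.cu; its generic pieces reappear as WarpScan/BlockScan in the new scan.hpp shown further down. The arithmetic the kernels relied on is easy to check on the host: with an inclusive prefix sum S over a row, the sum of any window [a, b] is S[b] - S[a-1], so each output pixel costs one subtraction and one multiply. A minimal C++ sketch of that idea (names are illustrative, not from the commit):

#include <vector>
#include <cstdio>

// Host-side sketch of the windowed-sum trick used by the removed kernels:
// with an inclusive prefix sum S, sum(row[a..b]) == S[b] - S[a-1].
std::vector<float> areaDownscaleRow(const std::vector<float>& row, int fx)
{
    std::vector<float> scan(row.size());
    float running = 0.f;
    for (size_t i = 0; i < row.size(); ++i)
        scan[i] = (running += row[i]);            // inclusive prefix sum

    std::vector<float> out(row.size() / fx);
    for (size_t o = 0; o < out.size(); ++o)
    {
        float start = (o == 0) ? 0.f : scan[o * fx - 1];
        float end   = scan[o * fx + fx - 1];
        out[o] = (end - start) / fx;              // average of one fx-wide window
    }
    return out;
}

int main()
{
    std::vector<float> row = {1, 3, 5, 7, 2, 4, 6, 8};
    std::vector<float> half = areaDownscaleRow(row, 2);  // {2, 6, 3, 7}
    for (float v : half) std::printf("%g ", v);
    std::printf("\n");
}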
modules/gpu/src/cuda/split_merge.cu
...
@@ -228,9 +228,9 @@ namespace cv { namespace gpu { namespace device
 template <typename T>
 static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
 {
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-    mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
+    dim3 block(32, 8);
+    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+    mergeC2_<T><<<grid, block, 0, stream>>>(
         src[0].data, src[0].step,
         src[1].data, src[1].step,
         dst.rows, dst.cols, dst.data, dst.step);
...
@@ -244,9 +244,9 @@ namespace cv { namespace gpu { namespace device
 template <typename T>
 static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
 {
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-    mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
+    dim3 block(32, 8);
+    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+    mergeC3_<T><<<grid, block, 0, stream>>>(
         src[0].data, src[0].step,
         src[1].data, src[1].step,
         src[2].data, src[2].step,
...
@@ -261,9 +261,9 @@ namespace cv { namespace gpu { namespace device
 template <typename T>
 static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
 {
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-    mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
+    dim3 block(32, 8);
+    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+    mergeC4_<T><<<grid, block, 0, stream>>>(
         src[0].data, src[0].step,
         src[1].data, src[1].step,
         src[2].data, src[2].step,
...
@@ -437,9 +437,9 @@ namespace cv { namespace gpu { namespace device
 template <typename T>
 static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
 {
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-    splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
+    dim3 block(32, 8);
+    dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+    splitC2_<T><<<grid, block, 0, stream>>>(
         src.data, src.step, src.rows, src.cols,
         dst[0].data, dst[0].step,
         dst[1].data, dst[1].step);
...
@@ -453,9 +453,9 @@ namespace cv { namespace gpu { namespace device
 template <typename T>
 static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
 {
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-    splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
+    dim3 block(32, 8);
+    dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+    splitC3_<T><<<grid, block, 0, stream>>>(
         src.data, src.step, src.rows, src.cols,
         dst[0].data, dst[0].step,
         dst[1].data, dst[1].step,
...
@@ -470,9 +470,9 @@ namespace cv { namespace gpu { namespace device
 template <typename T>
 static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
 {
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-    splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
+    dim3 block(32, 8);
+    dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+    splitC4_<T><<<grid, block, 0, stream>>>(
         src.data, src.step, src.rows, src.cols,
         dst[0].data, dst[0].step,
         dst[1].data, dst[1].step,
...
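Every hunk in this file is the same mechanical rename: the host-side launch locals blockDim/gridDim become block/grid, presumably so they stop shadowing (and reading like) CUDA's built-in device variables of the same names. The sizing itself is the standard "round the grid up" idiom; a self-contained sketch (the kernel body is a stand-in, only the launch math mirrors the diff):

#include <cuda_runtime.h>

__global__ void touchEveryPixel(unsigned char* data, int rows, int cols, size_t step)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;  // here blockDim is the built-in
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < cols && y < rows)
        data[y * step + x] = 0;
}

void launch(unsigned char* data, int rows, int cols, size_t step, cudaStream_t stream)
{
    dim3 block(32, 8);                               // 256 threads, as in the hunks
    dim3 grid((cols + block.x - 1) / block.x,        // divUp(cols, block.x)
              (rows + block.y - 1) / block.y);       // divUp(rows, block.y)
    touchEveryPixel<<<grid, block, 0, stream>>>(data, rows, cols, step);
}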
modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
...
@@ -129,9 +129,9 @@ texture<float, 1, cudaReadModeElementType> tex_diffusivity_y;
 __global__ void pointwise_add(float *d_res, const float *d_op1, const float *d_op2, const int len)
 {
     const int pos = blockIdx.x*blockDim.x + threadIdx.x;
     if(pos >= len) return;
     d_res[pos] = d_op1[pos] + d_op2[pos];
 }
...
@@ -265,7 +265,7 @@ __forceinline__ __device__ void diffusivity_along_y(float *s, int pos, const flo
 ///////////////////////////////////////////////////////////////////////////////
 template<int tex_id>
 __forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p)
 {
     //position within shared memory array
     const int ijs = js * PSOR_PITCH + is;
     //mirror reflection across borders
...
@@ -299,7 +299,7 @@ __forceinline__ __device__ void load_array_element(float *smem, int is, int js,
 ///\param h number of rows in global memory array
 ///\param p global memory array pitch in floats
 ///////////////////////////////////////////////////////////////////////////////
 template<int tex>
 __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p)
 {
     const int i = threadIdx.x + 2;
...
@@ -381,7 +381,7 @@ __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, i
 /// \param gamma (in) gamma in Brox model (edge importance)
 ///////////////////////////////////////////////////////////////////////////////
 __global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y,
                                         float *denominator_u, float *denominator_v,
                                         float *numerator_dudv,
                                         float *numerator_u, float *numerator_v,
...
@@ -532,16 +532,16 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
 // Red-Black SOR
 /////////////////////////////////////////////////////////////////////////////////////////
 template<int isBlack> __global__ void sor_pass(float *new_du,
                                                float *new_dv,
                                                const float *g_inv_denominator_u,
                                                const float *g_inv_denominator_v,
                                                const float *g_numerator_u,
                                                const float *g_numerator_v,
                                                const float *g_numerator_dudv,
                                                float omega,
                                                int width,
                                                int height,
                                                int stride)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
...
@@ -604,7 +604,7 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
 if((i+j)%2 == isBlack)
 {
     // update du
     float numerator_u = (s_left*(u_left + du_left) + s_up*(u_up + du_up) + s_right*(u_right + du_right) + s_down*(u_down + du_down) -
                          u * (s_left + s_right + s_up + s_down) - g_numerator_u[pos] - numerator_dudv*dv);

     du = (1.0f - omega) * du + omega * g_inv_denominator_u[pos] * numerator_u;
...
@@ -644,7 +644,7 @@ void InitTextures()
 initTexture2D(tex_I1);
 initTexture2D(tex_fine);   // for downsampling
 initTexture2D(tex_coarse); // for prolongation

 initTexture2D(tex_Ix);
 initTexture2D(tex_Ixx);
 initTexture2D(tex_Ix0);
...
@@ -725,7 +725,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 const Ncv32u kSourceHeight = frame0.height();

 ncvAssertPrintReturn(frame1.width() == kSourceWidth && frame1.height() == kSourceHeight, "Frame dims do not match", NCV_INCONSISTENT_INPUT);
 ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth &&
     uOut.height() == kSourceHeight && vOut.height() == kSourceHeight, NCV_INCONSISTENT_INPUT);

 ncvAssertReturn(gpu_mem_allocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);
...
@@ -780,7 +780,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 SAFE_VECTOR_DECL(dv_new, gpu_mem_allocator, kSizeInPixelsAligned);

 // temporary storage
 SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator,
     alignUp(kSourceWidth, kStrideAlignmentFloat) * alignUp(kSourceHeight, kStrideAlignmentFloat));

 // image derivatives
...
@@ -800,7 +800,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 {
     const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f};

     ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize,
         cudaMemcpyHostToDevice), NCV_CUDA_ERROR);

     InitTextures();
...
@@ -827,10 +827,10 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 size_t src_width_in_bytes = kSourceWidth * sizeof(float);
 size_t src_pitch_in_bytes = frame0.pitch();

 ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI0->ptr(), dst_width_in_bytes, frame0.ptr(),
     src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );

 ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI1->ptr(), dst_width_in_bytes, frame1.ptr(),
     src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
 }
...
@@ -876,11 +876,11 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 NcvRect32u dstROI (0, 0, level_width, level_height);

 // frame 0
 ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI,
     level_frame0->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) );

 // frame 1
 ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI,
     level_frame1->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) );
 }
...
@@ -956,14 +956,14 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 dim3 dThreads(32, 6);

 const int kPitchTex = kLevelStride * sizeof(float);

 NcvSize32u srcSize(kLevelWidth, kLevelHeight);
 Ncv32u nSrcStep = kLevelStride * sizeof(float);
 NcvRect32u oROI(0, 0, kLevelWidth, kLevelHeight);

 // Ix0
 ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Ix0.ptr(), srcSize, nSrcStep, oROI,
     nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

 // Iy0
 ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Iy0.ptr(), srcSize, nSrcStep, oROI,
...
@@ -987,8 +987,8 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 // Ixy
 ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
     nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

 ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix,  Ix.ptr(),  ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
 ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
 ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
...
@@ -1017,21 +1017,21 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 {
     //compute coefficients
     prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>>
         (diffusivity_x.ptr(),
          diffusivity_y.ptr(),
          denom_u.ptr(),
          denom_v.ptr(),
          num_dudv.ptr(),
          num_u.ptr(),
          num_v.ptr(),
          kLevelWidth,
          kLevelHeight,
          kLevelStride,
          alpha,
          gamma);

     ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

     ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
     ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
...
@@ -1043,7 +1043,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
 ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

 // linear system coefficients
 ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
 ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
...
@@ -1055,26 +1055,26 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
 ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

 //solve linear system
 for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
 {
     float omega = 1.99f;

     ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
     ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

     sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>
         (du_new.ptr(),
          dv_new.ptr(),
          denom_u.ptr(),
          denom_v.ptr(),
          num_u.ptr(),
          num_v.ptr(),
          num_dudv.ptr(),
          omega,
          kLevelWidth,
          kLevelHeight,
          kLevelStride);

     ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
...
@@ -1083,16 +1083,16 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

 sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>
     (du.ptr(),
      dv.ptr(),
      denom_u.ptr(),
      denom_v.ptr(),
      num_u.ptr(),
      num_v.ptr(),
      num_dudv.ptr(),
      omega,
      kLevelWidth,
      kLevelHeight,
      kLevelStride);

 ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
...
@@ -1120,19 +1120,19 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
 dim3 p_threads(32, 8);

-NcvSize32u srcSize (kLevelWidth, kLevelHeight);
+NcvSize32u inner_srcSize (kLevelWidth, kLevelHeight);
 NcvSize32u dstSize (nw, nh);
 NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight);
 NcvRect32u dstROI (0, 0, nw, nh);

-ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
+ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,
     ptrUNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );
 ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
 ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

-ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
+ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,
     ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );
 ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
...
@@ -1148,11 +1148,11 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);

 ncvAssertCUDAReturn( cudaMemcpy2DAsync
     (uOut.ptr(), uOut.pitch(), ptrU->ptr(),
      kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );

 ncvAssertCUDAReturn( cudaMemcpy2DAsync
     (vOut.ptr(), vOut.pitch(), ptrV->ptr(),
      kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );

 ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
...
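Nearly all of this file's 68/68 changed lines are whitespace-only (old and new lines render identically above); the one real rename is srcSize to inner_srcSize, which stops the prolongation block's local from shadowing the srcSize declared earlier in the function. Since sor_pass is the heart of the solver, a brief note on the red-black ordering it uses: cells are checkerboard-colored by (i + j) % 2, one launch updates all cells of one color from the fixed other color, then a second launch does the reverse, so no cell ever reads a value written in its own launch. An illustrative 5-point-Laplacian version, not the commit's code:

__global__ void sorColorPass(float* u, const float* rhs, int w, int h, int color, float omega)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i <= 0 || j <= 0 || i >= w - 1 || j >= h - 1) return;
    if ((i + j) % 2 != color) return;              // update one color per launch

    int p = j * w + i;
    float gs = 0.25f * (u[p - 1] + u[p + 1] + u[p - w] + u[p + w] - rhs[p]);
    u[p] = (1.0f - omega) * u[p] + omega * gs;     // over-relaxed Gauss-Seidel step
}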
modules/gpu/src/nvidia/core/NCV.cu
...
@@ -252,7 +252,7 @@ NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
 //===================================================================

-NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
+NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment_)
     :
     currentSize(0),
     _maxSize(0),
...
@@ -260,23 +260,23 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
     begin(NULL),
     end(NULL),
     _memType(NCVMemoryTypeNone),
-    _alignment(alignment),
+    _alignment(alignment_),
     bReusesMemory(false)
 {
-    NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
+    NcvBool bProperAlignment = (alignment_ & (alignment_-1)) == 0;
     ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
 }

-NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr)
+NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment_, void *reusePtr)
     :
     currentSize(0),
     _maxSize(0),
     allocBegin(NULL),
     _memType(memT),
-    _alignment(alignment)
+    _alignment(alignment_)
 {
-    NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
+    NcvBool bProperAlignment = (alignment_ & (alignment_-1)) == 0;
     ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2");
     ncvAssertPrintCheck(memT != NCVMemoryTypeNone, "NCVMemStackAllocator ctor:: Incorrect allocator type");
...
@@ -425,12 +425,12 @@ size_t NCVMemStackAllocator::maxSize(void) const
 //===================================================================

-NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment)
+NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment_)
     :
     currentSize(0),
     _maxSize(0),
     _memType(memT),
-    _alignment(alignment)
+    _alignment(alignment_)
 {
     ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );
 }
...
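Both constructors validate the (now renamed) alignment with the classic power-of-two test (x & (x - 1)) == 0. Why it works: a power of two has exactly one set bit, and x - 1 flips that bit while setting every lower bit, so the AND is zero; any other value keeps its top bit through the AND. For example 16 & 15 = 10000b & 01111b = 0, while 12 & 11 = 1100b & 1011b = 1000b, which is nonzero. One caveat worth remembering:

static inline bool isPow2(unsigned int x) { return (x & (x - 1)) == 0; }
// isPow2(16) -> true, isPow2(12) -> false, but isPow2(0) -> true as well,
// so zero must be rejected separately where it is not a valid alignment.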
modules/gpu/src/opencv2/gpu/device/common.hpp
...
@@ -64,7 +64,7 @@
     #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__)
 #endif

 namespace cv { namespace gpu
 {
     void error(const char *error_string, const char *file, const int line, const char *func);
...
@@ -87,14 +87,14 @@ static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int
 #ifdef __CUDACC__

 namespace cv { namespace gpu
 {
     __host__ __device__ __forceinline__ int divUp(int total, int grain)
     {
         return (total + grain - 1) / grain;
     }

     namespace device
     {
         typedef unsigned char uchar;
         typedef unsigned short ushort;
...
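divUp is the ceiling division used throughout the launch code above: divUp(1000, 256) = (1000 + 255) / 256 = 4, so four 256-thread blocks cover 1000 elements, with 24 idle threads in the last block. A hypothetical launch built on it (assumes this header is included in a .cu file):

__global__ void fill(float* p, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) p[i] = 1.f;                            // guard the rounded-up tail
}

void launchFill(float* p, int n)
{
    const int threads = 256;
    const int blocks  = cv::gpu::divUp(n, threads);   // 4 blocks for n == 1000
    fill<<<blocks, threads>>>(p, n);
}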
modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp
...
@@ -45,7 +45,7 @@
 #include "common.hpp"

 namespace cv { namespace gpu { namespace device
 {
     #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
...
@@ -54,13 +54,13 @@ namespace cv { namespace gpu { namespace device
     {
         __device__ __forceinline__ static void Load(const T* ptr, int offset, T& val)
         {
             val = ptr[offset];
         }
     };

     #else // __CUDA_ARCH__ >= 200

         #if defined(_WIN64) || defined(__LP64__)
             // 64-bit register modifier for inlined asm
             #define OPENCV_GPU_ASM_PTR "l"
         #else
             // 32-bit register modifier for inlined asm
             #define OPENCV_GPU_ASM_PTR "r"
         #endif
...
@@ -84,21 +84,21 @@ namespace cv { namespace gpu { namespace device
             asm("ld.global." #ptx_type " %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : OPENCV_GPU_ASM_PTR(ptr + offset)); \
         } \
     };

     OPENCV_GPU_DEFINE_FORCE_GLOB_B(uchar,  u8)
     OPENCV_GPU_DEFINE_FORCE_GLOB_B(schar,  s8)
     OPENCV_GPU_DEFINE_FORCE_GLOB_B(char,   b8)
     OPENCV_GPU_DEFINE_FORCE_GLOB  (ushort, u16, h)
     OPENCV_GPU_DEFINE_FORCE_GLOB  (short,  s16, h)
     OPENCV_GPU_DEFINE_FORCE_GLOB  (uint,   u32, r)
     OPENCV_GPU_DEFINE_FORCE_GLOB  (int,    s32, r)
     OPENCV_GPU_DEFINE_FORCE_GLOB  (float,  f32, f)
     OPENCV_GPU_DEFINE_FORCE_GLOB  (double, f64, d)

     #undef OPENCV_GPU_DEFINE_FORCE_GLOB
     #undef OPENCV_GPU_DEFINE_FORCE_GLOB_B
     #undef OPENCV_GPU_ASM_PTR

     #endif // __CUDA_ARCH__ >= 200
 }}} // namespace cv { namespace gpu { namespace device
...
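These macros specialize a small loader (the struct they fill in is named ForceGlob in this header; the name is not visible in the hunk above, so treat it as an assumption). Its Load(ptr, offset, val) is a plain val = ptr[offset] on sm_20 and newer, but on sm_1x it is emitted as an explicit ld.global PTX instruction so the access cannot be compiled as anything but a global-memory load. A hedged usage sketch with a hypothetical kernel:

__global__ void sumKernel(const float* src, float* dst, int n)
{
    float acc = 0.f;
    for (int i = threadIdx.x; i < n; i += blockDim.x)
    {
        float v;
        cv::gpu::device::ForceGlob<float>::Load(src, i, v);  // instead of v = src[i]
        acc += v;
    }
    dst[threadIdx.x] = acc;
}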
modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp
...
@@ -44,7 +44,7 @@
 #define __OPENCV_GPU_DYNAMIC_SMEM_HPP__

 namespace cv { namespace gpu { namespace device
 {
     template<class T> struct DynamicSharedMem
     {
         __device__ __forceinline__ operator T*()
...
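DynamicSharedMem wraps the kernel's dynamically sized extern __shared__ buffer behind an operator T*(), so templated device code can view the raw bytes as any element type. A usage sketch, assuming that is what the (truncated) operator above returns; the kernel is hypothetical:

__global__ void reverseRow(float* row, int n)
{
    cv::gpu::device::DynamicSharedMem<float> shmem;
    float* buf = shmem;                        // the raw smem viewed as float*

    if (threadIdx.x < n) buf[threadIdx.x] = row[threadIdx.x];
    __syncthreads();
    if (threadIdx.x < n) row[threadIdx.x] = buf[n - 1 - threadIdx.x];
}

// The dynamic size goes in the third launch parameter:
//   reverseRow<<<1, 256, 256 * sizeof(float)>>>(d_row, 256);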
modules/gpu/src/opencv2/gpu/device/emulation.hpp
...
@@ -45,21 +45,21 @@
 #include "warp_reduce.hpp"

 namespace cv { namespace gpu { namespace device
 {
     struct Emulation
     {
         static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
         {
 #if __CUDA_ARCH__ >= 200
             (void)cta_buffer;
             return __ballot(predicate);
 #else
             int tid = threadIdx.x;
             cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
             return warp_reduce(cta_buffer);
 #endif
         }
     };
 }}} // namespace cv { namespace gpu { namespace device
...
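Emulation::Ballot reproduces the warp vote mask of __ballot() even on pre-Fermi hardware: each lane writes a one-hot mask (1 << lane) when its predicate holds, and warp_reduce sums the masks, which equals the bitwise OR because the set bits are disjoint. A usage sketch for a single warp (hypothetical kernel):

__global__ void positiveMask(const float* data, int* mask_out)
{
    __shared__ volatile int cta_buffer[32];

    int mask = cv::gpu::device::Emulation::Ballot(data[threadIdx.x] > 0.f, cta_buffer);

    if (threadIdx.x == 0)
        *mask_out = mask;    // bit i set <=> lane i saw a positive value
}

// launched as: positiveMask<<<1, 32>>>(d_data, d_mask);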
modules/gpu/src/opencv2/gpu/device/funcattrib.hpp
...
@@ -46,14 +46,14 @@
 #include <cstdio>

 namespace cv { namespace gpu { namespace device
 {
     template<class Func>
     void printFuncAttrib(Func& func)
     {
         cudaFuncAttributes attrs;
         cudaFuncGetAttributes(&attrs, func);

         printf("=== Function stats ===\n");
         printf("Name: \n");
...
@@ -65,7 +65,7 @@ namespace cv { namespace gpu { namespace device
         printf("ptxVersion         = %d\n", attrs.ptxVersion);
         printf("binaryVersion      = %d\n", attrs.binaryVersion);
         printf("\n");
         fflush(stdout);
     }
 }}} // namespace cv { namespace gpu { namespace device
...
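printFuncAttrib is a debugging aid: given a kernel, it dumps the attributes the compiler baked into it (register count, shared/constant/local memory, PTX and binary versions). A usage sketch with a hypothetical kernel:

__global__ void myKernel(float* p) { p[threadIdx.x] *= 2.f; }

void dumpMyKernelStats()
{
    cv::gpu::device::printFuncAttrib(myKernel);  // prints the "=== Function stats ===" block
}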
modules/gpu/src/opencv2/gpu/device/functional.hpp
...
@@ -48,7 +48,7 @@
 #include "vec_traits.hpp"
 #include "type_traits.hpp"

 namespace cv { namespace gpu { namespace device
 {
     // Function Objects
...
@@ -257,7 +257,7 @@ namespace cv { namespace gpu { namespace device
 template <typename T> struct bit_not : unary_function<T, T>
 {
     __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const
     {
         return ~v;
     }
...
@@ -268,7 +268,7 @@ namespace cv { namespace gpu { namespace device
 // Generalized Identity Operations
 template <typename T> struct identity : unary_function<T, T>
 {
     __device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const
     {
         return x;
     }
...
@@ -278,7 +278,7 @@ namespace cv { namespace gpu { namespace device
 template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>
 {
     __device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
     {
         return lhs;
     }
...
@@ -288,7 +288,7 @@ namespace cv { namespace gpu { namespace device
 template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>
 {
     __device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
     {
         return rhs;
     }
...
@@ -308,7 +308,7 @@ namespace cv { namespace gpu { namespace device
 template <typename T> struct maximum : binary_function<T, T, T>
 {
     __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
     {
         return lhs < rhs ? rhs : lhs;
     }
...
@@ -328,7 +328,7 @@ namespace cv { namespace gpu { namespace device
 template <typename T> struct minimum : binary_function<T, T, T>
 {
     __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
     {
         return lhs < rhs ? lhs : rhs;
     }
...
@@ -410,12 +410,14 @@ namespace cv { namespace gpu { namespace device
 #undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR
 #undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR

 template <typename T> struct hypot_sqr_func : binary_function<T, T, float>
 {
     __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const
     {
         return src1 * src1 + src2 * src2;
     }
+    __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func& other) : binary_function<T, T, float>(){}
+    __device__ __forceinline__ hypot_sqr_func() : binary_function<T, T, float>(){}
 };

 // Saturate Cast Functor
...
@@ -438,6 +440,7 @@ namespace cv { namespace gpu { namespace device
     {
         return (src > thresh) * maxVal;
     }
+    __device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other) : unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
...
@@ -455,6 +458,7 @@ namespace cv { namespace gpu { namespace device
     {
         return (src <= thresh) * maxVal;
     }
+    __device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other) : unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
...
@@ -519,12 +523,16 @@ namespace cv { namespace gpu { namespace device
 explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}

 __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const
 {
     return !pred(x);
 }

+__device__ __forceinline__ unary_negate(const unary_negate& other) : unary_function<typename Predicate::argument_type, bool>(){}
+__device__ __forceinline__ unary_negate() : unary_function<typename Predicate::argument_type, bool>(){}

 const Predicate pred;
 };

 template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)
 {
     return unary_negate<Predicate>(pred);
...
@@ -534,19 +542,26 @@ namespace cv { namespace gpu { namespace device
 {
     explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}

     __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x, typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
     {
         return !pred(x, y);
     }

+    __device__ __forceinline__ binary_negate(const binary_negate& other) : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
+    __device__ __forceinline__ binary_negate() : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}

     const Predicate pred;
 };

 template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)
 {
     return binary_negate<BinaryPredicate>(pred);
 }

 template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
 {
     __host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}
...
@@ -555,15 +570,19 @@ namespace cv { namespace gpu { namespace device
         return op(arg1, a);
     }

+    __device__ __forceinline__ binder1st(const binder1st& other) : unary_function<typename Op::second_argument_type, typename Op::result_type>(){}

     const Op op;
     const typename Op::first_argument_type arg1;
 };

 template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)
 {
     return binder1st<Op>(op, typename Op::first_argument_type(x));
 }

 template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>
 {
     __host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}
...
@@ -572,16 +591,19 @@ namespace cv { namespace gpu { namespace device
         return op(a, arg2);
     }

+    __device__ __forceinline__ binder2nd(const binder2nd& other) : unary_function<typename Op::first_argument_type, typename Op::result_type>(), op(other.op), arg2(other.arg2){}

     const Op op;
     const typename Op::second_argument_type arg2;
 };

 template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)
 {
     return binder2nd<Op>(op, typename Op::second_argument_type(x));
 }
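These binders are device-side counterparts of the pre-C++11 std::bind1st/std::bind2nd: they freeze one argument of a binary functor and yield a unary one. The copy constructors this hunk adds matter because functors are passed to kernels by value. A sketch combining bind2nd with the minimum functor shown earlier (hypothetical kernel):

using namespace cv::gpu::device;

__global__ void clampTo4(const float* src, float* dst, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = bind2nd(minimum<float>(), 4.0f)(src[i]);  // == min(src[i], 4.0f)
}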
 // Functor Traits
 template <typename F> struct IsUnaryFunction
 {
     typedef char Yes;
...
@@ -618,7 +640,7 @@ namespace cv { namespace gpu { namespace device
 {
     enum { shift = UnOpShift<sizeof(T), sizeof(D)>::shift };
 };

 template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };
 template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };
 template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };
...
modules/gpu/src/opencv2/gpu/device/limits.hpp
...
@@ -46,7 +46,7 @@
 #include <limits>
 #include "common.hpp"

 namespace cv { namespace gpu { namespace device
 {
     template<class T> struct numeric_limits
     {
...
modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp
...
@@ -57,35 +57,35 @@ namespace cv { namespace gpu { namespace device
 template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }

 template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
 {
     return (uchar) ::max((int)v, 0);
 }
 template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
 {
     return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
 }
 template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
 {
     return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
 }
 template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
 {
     return (uchar) ::min(v, (uint)UCHAR_MAX);
 }
 template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
 {
     return saturate_cast<uchar>((uint)v);
 }
 template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
 {
     int iv = __float2int_rn(v);
     return saturate_cast<uchar>(iv);
 }
 template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
 {
 #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
     int iv = __double2int_rn(v);
     return saturate_cast<uchar>(iv);
 #else
     return saturate_cast<uchar>((float)v);
...
@@ -93,35 +93,35 @@ namespace cv { namespace gpu { namespace device
 }
 template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
 {
     return (schar) ::min((int)v, SCHAR_MAX);
 }
 template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
 {
     return (schar) ::min((uint)v, (uint)SCHAR_MAX);
 }
 template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
 {
     return (schar)((uint)(v - SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
 }
 template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
 {
     return saturate_cast<schar>((int)v);
 }
 template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
 {
     return (schar) ::min(v, (uint)SCHAR_MAX);
 }
 template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
 {
     int iv = __float2int_rn(v);
     return saturate_cast<schar>(iv);
 }
 template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
 {
 #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
     int iv = __double2int_rn(v);
     return saturate_cast<schar>(iv);
 #else
     return saturate_cast<schar>((float)v);
...
@@ -129,30 +129,30 @@ namespace cv { namespace gpu { namespace device
 }
 template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
 {
     return (ushort) ::max((int)v, 0);
 }
 template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
 {
     return (ushort) ::max((int)v, 0);
 }
 template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
 {
     return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
 }
 template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
 {
     return (ushort) ::min(v, (uint)USHRT_MAX);
 }
 template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
 {
     int iv = __float2int_rn(v);
     return saturate_cast<ushort>(iv);
 }
 template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
 {
 #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
     int iv = __double2int_rn(v);
     return saturate_cast<ushort>(iv);
 #else
     return saturate_cast<ushort>((float)v);
...
@@ -160,37 +160,37 @@ namespace cv { namespace gpu { namespace device
 }
 template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
 {
     return (short) ::min((int)v, SHRT_MAX);
 }
 template<> __device__ __forceinline__ short saturate_cast<short>(int v)
 {
     return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
 }
 template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
 {
     return (short) ::min(v, (uint)SHRT_MAX);
 }
 template<> __device__ __forceinline__ short saturate_cast<short>(float v)
 {
     int iv = __float2int_rn(v);
     return saturate_cast<short>(iv);
 }
 template<> __device__ __forceinline__ short saturate_cast<short>(double v)
 {
 #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
     int iv = __double2int_rn(v);
     return saturate_cast<short>(iv);
 #else
     return saturate_cast<short>((float)v);
 #endif
 }

 template<> __device__ __forceinline__ int saturate_cast<int>(float v)
 {
     return __float2int_rn(v);
 }
 template<> __device__ __forceinline__ int saturate_cast<int>(double v)
 {
 #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
     return __double2int_rn(v);
...
@@ -200,11 +200,11 @@ namespace cv { namespace gpu { namespace device
 }
 template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
 {
     return __float2uint_rn(v);
 }
 template<> __device__ __forceinline__ uint saturate_cast<uint>(double v)
 {
 #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
     return __double2uint_rn(v);
 #else
...
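A few worked values for the specializations above (this file's changes are whitespace-only; the behaviour is unchanged):

//   saturate_cast<uchar>(300)    == 255    (clamped to UCHAR_MAX)
//   saturate_cast<uchar>(-7)     == 0      (negative clamps to zero)
//   saturate_cast<uchar>(127.6f) == 128    (__float2int_rn rounds to nearest first)
//   saturate_cast<short>(40000)  == 32767  (clamped to SHRT_MAX)
// The (uint)v <= UCHAR_MAX comparison handles both bounds at once: casting a
// negative int to uint wraps it to a huge value, so the test fails and the
// 'v > 0 ? UCHAR_MAX : 0' branch picks the correct end of the range.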
modules/gpu/src/opencv2/gpu/device/scan.hpp (new file, mode 0 → 100644)
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_SCAN_HPP__
#define __OPENCV_GPU_SCAN_HPP__
enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };

template <ScanKind Kind, typename T, typename F> struct WarpScan
{
    __device__ __forceinline__ WarpScan() {}
    __device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; }

    __device__ __forceinline__ T operator()(volatile T* ptr, const unsigned int idx)
    {
        const unsigned int lane = idx & 31;
        F op;

        if (lane >=  1) ptr[idx] = op(ptr[idx -  1], ptr[idx]);
        if (lane >=  2) ptr[idx] = op(ptr[idx -  2], ptr[idx]);
        if (lane >=  4) ptr[idx] = op(ptr[idx -  4], ptr[idx]);
        if (lane >=  8) ptr[idx] = op(ptr[idx -  8], ptr[idx]);
        if (lane >= 16) ptr[idx] = op(ptr[idx - 16], ptr[idx]);

        if (Kind == INCLUSIVE)
            return ptr[idx];
        else
            return (lane > 0) ? ptr[idx - 1] : 0;
    }

    __device__ __forceinline__ unsigned int index(const unsigned int tid)
    {
        return tid;
    }

    __device__ __forceinline__ void init(volatile T* ptr) {}

    static const int warp_offset = 0;

    typedef WarpScan<INCLUSIVE, T, F> merge;
};

template <ScanKind Kind, typename T, typename F> struct WarpScanNoComp
{
    __device__ __forceinline__ WarpScanNoComp() {}
    __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; }

    __device__ __forceinline__ T operator()(volatile T* ptr, const unsigned int idx)
    {
        const unsigned int lane = threadIdx.x & 31;
        F op;

        ptr[idx] = op(ptr[idx -  1], ptr[idx]);
        ptr[idx] = op(ptr[idx -  2], ptr[idx]);
        ptr[idx] = op(ptr[idx -  4], ptr[idx]);
        ptr[idx] = op(ptr[idx -  8], ptr[idx]);
        ptr[idx] = op(ptr[idx - 16], ptr[idx]);

        if (Kind == INCLUSIVE)
            return ptr[idx];
        else
            return (lane > 0) ? ptr[idx - 1] : 0;
    }

    __device__ __forceinline__ unsigned int index(const unsigned int tid)
    {
        return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
    }

    __device__ __forceinline__ void init(volatile T* ptr)
    {
        ptr[threadIdx.x] = 0;
    }

    static const int warp_smem_stride = 32 + 16 + 1;
    static const int warp_offset      = 16;
    static const int warp_log         = 5;
    static const int warp_mask        = 31;

    typedef WarpScanNoComp<INCLUSIVE, T, F> merge;
};

template <ScanKind Kind, typename T, typename Sc, typename F> struct BlockScan
{
    __device__ __forceinline__ BlockScan() {}
    __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; }

    __device__ __forceinline__ T operator()(volatile T* ptr)
    {
        const unsigned int tid  = threadIdx.x;
        const unsigned int lane = tid & warp_mask;
        const unsigned int warp = tid >> warp_log;

        Sc scan;
        typename Sc::merge merge_scan;
        const unsigned int idx = scan.index(tid);

        T val = scan(ptr, idx);
        __syncthreads();

        if (warp == 0)
            scan.init(ptr);
        __syncthreads();

        if (lane == 31)
            ptr[scan.warp_offset + warp] = (Kind == INCLUSIVE) ? val : ptr[idx];
        __syncthreads();

        if (warp == 0)
            merge_scan(ptr, idx);
        __syncthreads();

        if (warp > 0)
            val = ptr[scan.warp_offset + warp - 1] + val;
        __syncthreads();

        ptr[idx] = val;
        __syncthreads();

        return val;
    }

    static const int warp_log  = 5;
    static const int warp_mask = 31;
};
#endif
\ No newline at end of file
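For orientation, here is a minimal usage sketch for the new header (not part of the commit): an inclusive prefix sum over one 256-thread block. The `Sum` functor and the kernel are assumptions; note that BlockScan's cross-warp merge step hardcodes `+`, so an additive functor is the intended use.

// Hypothetical usage of scan.hpp: inclusive prefix sum of 256 ints per block,
// launched as inclusiveScan256<<<numBlocks, 256>>>(d_in, d_out).
struct Sum
{
    __device__ __forceinline__ int operator()(int a, int b) const { return a + b; }
};

__global__ void inclusiveScan256(const int* in, int* out)
{
    __shared__ volatile int smem[256];

    const int gid = blockIdx.x * 256 + threadIdx.x;
    smem[threadIdx.x] = in[gid];
    __syncthreads();

    // Per-warp scans are stitched together by BlockScan via the warp totals.
    BlockScan<INCLUSIVE, int, WarpScan<INCLUSIVE, int, Sum>, Sum> scan;
    out[gid] = scan(smem);
}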
modules/gpu/src/opencv2/gpu/device/static_check.hpp
View file @
e7f6c4b7
...
...
@@ -43,27 +43,27 @@
#ifndef __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__
#define __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__

#if defined(__CUDACC__)
    #define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
#else
    #define __OPENCV_GPU_HOST_DEVICE__
#endif

namespace cv { namespace gpu
{
    namespace device
    {
        template<bool expr> struct Static {};

        template<> struct Static<true>
        {
            __OPENCV_GPU_HOST_DEVICE__ static void check() {};
        };
    }

    using ::cv::gpu::device::Static;
}}

#undef __OPENCV_GPU_HOST_DEVICE__

#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */
\ No newline at end of file
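A one-line usage sketch (hypothetical): because only the `Static<true>` specialization defines `check()`, instantiating `Static<false>` fails at compile time, which is the usual compile-time guard pattern in these headers.

// Compiles only when N is a multiple of the warp size (hypothetical example).
template <int N> __device__ void useTile(volatile float* tile)
{
    cv::gpu::Static<N % 32 == 0>::check();
}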
modules/gpu/src/opencv2/gpu/device/transform.hpp
View file @
e7f6c4b7
...
...
@@ -47,7 +47,7 @@
#include "utility.hpp"
#include "detail/transform_detail.hpp"
namespace cv { namespace gpu { namespace device
{
    template <typename T, typename D, typename UnOp, typename Mask>
    static inline void transform(DevMem2D_<T> src, DevMem2D_<D> dst, UnOp op, const Mask& mask, cudaStream_t stream)
...
...
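A usage sketch for the wrapper whose declaration is shown above. The `Scale` functor and the `halve` helper are hypothetical, and the call assumes it lives in a .cu file compiled by nvcc; `WithOutMask` is the no-op mask functor from utility.hpp.

// Halve every float pixel of src into dst on the default stream.
struct Scale
{
    __device__ __forceinline__ float operator()(float v) const { return 0.5f * v; }
};

void halve(cv::gpu::DevMem2Df src, cv::gpu::DevMem2Df dst)
{
    cv::gpu::device::transform(src, dst, Scale(), cv::gpu::device::WithOutMask(), 0);
}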
modules/gpu/src/opencv2/gpu/device/type_traits.hpp
View file @
e7f6c4b7
...
...
@@ -45,11 +45,11 @@
#include "detail/type_traits_detail.hpp"
namespace cv { namespace gpu { namespace device
{
    template <typename T> struct IsSimpleParameter
    {
        enum { value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value ||
            type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<T>::type>::value };
    };
...
...
@@ -65,16 +65,16 @@ namespace cv { namespace gpu { namespace device
        enum { isVolatile = type_traits_detail::UnVolatile<T>::value };

        enum { isReference = type_traits_detail::ReferenceTraits<UnqualifiedType>::value };
        enum { isPointer = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<UnqualifiedType>::type>::value };

        enum { isUnsignedInt = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };
        enum { isSignedInt = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };
        enum { isIntegral = type_traits_detail::IsIntegral<UnqualifiedType>::value };
        enum { isFloat = type_traits_detail::IsFloat<UnqualifiedType>::value };
        enum { isArith = isIntegral || isFloat };
        enum { isVec = type_traits_detail::IsVec<UnqualifiedType>::value };

        typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value,
            T, typename type_traits_detail::AddParameterType<T>::type>::type ParameterType;
    };
}}}
...
...
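A usage sketch (hypothetical): the point of `ParameterType` is to pass simple types by value and everything else by const reference, selected at compile time.

// Hypothetical helper: v is plain T for scalars and pointers, const T& otherwise.
template <typename T>
__device__ __forceinline__ void store(T* dst, typename cv::gpu::device::TypeTraits<T>::ParameterType v)
{
    *dst = v;
}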
modules/gpu/src/opencv2/gpu/device/utility.hpp
View file @
e7f6c4b7
...
...
@@ -47,17 +47,17 @@
#include "datamov_utils.hpp"
#include "detail/utility_detail.hpp"
namespace cv { namespace gpu { namespace device
{
    #define OPENCV_GPU_LOG_WARP_SIZE        (5)
    #define OPENCV_GPU_WARP_SIZE            (1 << OPENCV_GPU_LOG_WARP_SIZE)
    #define OPENCV_GPU_LOG_MEM_BANKS        ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
    #define OPENCV_GPU_MEM_BANKS            (1 << OPENCV_GPU_LOG_MEM_BANKS)

    ///////////////////////////////////////////////////////////////////////////////
    // swap

    template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b)
    {
        const T temp = a;
        a = b;
...
...
@@ -71,9 +71,9 @@ namespace cv { namespace gpu { namespace device
    {
        explicit __host__ __device__ __forceinline__ SingleMask(PtrStepb mask_) : mask(mask_) {}
        __host__ __device__ __forceinline__ SingleMask(const SingleMask& mask_) : mask(mask_.mask) {}

        __device__ __forceinline__ bool operator()(int y, int x) const
        {
            return mask.ptr(y)[x] != 0;
        }
...
...
@@ -82,13 +82,13 @@ namespace cv { namespace gpu { namespace device
    struct SingleMaskChannels
    {
        __host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_)
            : mask(mask_), channels(channels_) {}
        __host__ __device__ __forceinline__ SingleMaskChannels(const SingleMaskChannels& mask_)
            : mask(mask_.mask), channels(mask_.channels) {}

        __device__ __forceinline__ bool operator()(int y, int x) const
        {
            return mask.ptr(y)[x / channels] != 0;
        }
...
...
@@ -112,7 +112,7 @@ namespace cv { namespace gpu { namespace device
        {
            curMask = maskCollection[z];
        }

        __device__ __forceinline__ bool operator()(int y, int x) const
        {
            uchar val;
...
...
@@ -165,20 +165,20 @@ namespace cv { namespace gpu { namespace device
            utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
        }

        template <int n, typename T, typename V, typename Pred>
        __device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
        {
            StaticAssert<n >= 8 && n <= 512>::check();
            utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
        }

        template <int n, typename T, typename V1, typename V2, typename Pred>
        __device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
        {
            StaticAssert<n >= 8 && n <= 512>::check();
            utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
        }
///////////////////////////////////////////////////////////////////////////////
// Solve linear system
...
...
@@ -212,17 +212,17 @@ namespace cv { namespace gpu { namespace device
        {
            double invdet = 1.0 / det;

            x[0] = saturate_cast<T>(invdet *
                (b[0]    * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
                 A[0][1] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) +
                 A[0][2] * (b[1]    * A[2][1] - A[1][1] * b[2]   )));

            x[1] = saturate_cast<T>(invdet *
                (A[0][0] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) -
                 b[0]    * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
                 A[0][2] * (A[1][0] * b[2]    - b[1]    * A[2][0])));

            x[2] = saturate_cast<T>(invdet *
                (A[0][0] * (A[1][1] * b[2]    - b[1]    * A[2][1]) -
                 A[0][1] * (A[1][0] * b[2]    - b[1]    * A[2][0]) +
                 b[0]    * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));
...
...
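Alongside the predicated variants shown above, a plain block reduction follows the same pattern. A minimal sketch (the `Plus` functor and the kernel are hypothetical; per the StaticAssert, the block size must lie between 8 and 512, a power of two in practice):

struct Plus
{
    __device__ __forceinline__ float operator()(float a, float b) const { return a + b; }
};

template <int BLOCK_SIZE>
__global__ void blockSum(const float* in, float* out)
{
    __shared__ volatile float smem[BLOCK_SIZE];

    float partial = in[blockIdx.x * BLOCK_SIZE + threadIdx.x];
    cv::gpu::device::reduce<BLOCK_SIZE>(smem, partial, threadIdx.x, Plus());

    if (threadIdx.x == 0)
        out[blockIdx.x] = partial; // thread 0 holds the block total after reduce()
}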
modules/gpu/src/opencv2/gpu/device/vec_distance.hpp
View file @
e7f6c4b7
...
...
@@ -47,7 +47,7 @@
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"
namespace cv { namespace gpu { namespace device
{
    template <typename T> struct L1Dist
    {
...
...
@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device
    };

    // calc distance between two vectors in global memory
    template <int THREAD_DIM, typename Dist, typename T1, typename T2>
    __device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
    {
        for (int i = tid; i < len; i += THREAD_DIM)
...
...
@@ -170,9 +170,9 @@ namespace cv { namespace gpu { namespace device
    // calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
    __device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
    {
        vec_distance_detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);

        dist.reduceAll<THREAD_DIM>(smem, tid);
    }
...
...
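A usage sketch (hypothetical kernel; it assumes the Euclidean accumulator `L2Dist` defined elsewhere in this header, with `result_type` float and a conversion to float): one block computes the distance between two length-`len` vectors in global memory.

template <int THREAD_DIM>
__global__ void pairDistance(const float* a, const float* b, int len, float* result)
{
    __shared__ float smem[THREAD_DIM];

    cv::gpu::device::L2Dist dist;
    cv::gpu::device::calcVecDiffGlobal<THREAD_DIM>(a, b, len, dist, smem, threadIdx.x);

    if (threadIdx.x == 0)
        *result = dist; // the accumulator converts to float after the internal reduceAll
}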
modules/gpu/src/opencv2/gpu/device/vec_math.hpp
View file @
e7f6c4b7
...
...
@@ -47,7 +47,7 @@
#include "vec_traits.hpp"
#include "functional.hpp"
namespace cv { namespace gpu { namespace device
{
    namespace vec_math_detail
    {
...
...
@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device
    }

    namespace vec_math_detail
    {
        template <typename T1, typename T2> struct BinOpTraits
        {
            typedef int argument_type;
...
...
@@ -326,5 +326,5 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_IMPLEMENT_VEC_OP
#undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_VECMATH_HPP__
\ No newline at end of file
modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
View file @
e7f6c4b7
...
...
@@ -45,7 +45,7 @@
#include "common.hpp"
namespace cv { namespace gpu { namespace device
{
    template <typename T, int N> struct TypeVec;
...
...
@@ -219,18 +219,18 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS

    template<> struct VecTraits<char>
    {
        typedef char elem_type;
        enum {cn=1};
        static __device__ __host__ __forceinline__ char all(char v) {return v;}
        static __device__ __host__ __forceinline__ char make(char x) {return x;}
        static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}
    };
    template<> struct VecTraits<schar>
    {
        typedef schar elem_type;
        enum {cn=1};
        static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
        static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
        static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}
...
...
modules/gpu/src/opencv2/gpu/device/warp.hpp
View file @
e7f6c4b7
...
...
@@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_DEVICE_WARP_HPP__
#define __OPENCV_GPU_DEVICE_WARP_HPP__
namespace cv { namespace gpu { namespace device
{
    struct Warp
    {
...
...
@@ -64,18 +64,18 @@ namespace cv { namespace gpu { namespace device
        template<typename It, typename T>
        static __device__ __forceinline__ void fill(It beg, It end, const T& value)
        {
            for (It t = beg + laneId(); t < end; t += STRIDE)
                *t = value;
        }

        template<typename InIt, typename OutIt>
        static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
        {
            for (InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
                *out = *t;
            return out;
        }

        template<typename InIt, typename OutIt, class UnOp>
        static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)
...
...
@@ -90,7 +90,7 @@ namespace cv { namespace gpu { namespace device
        {
            unsigned int lane = laneId();

            InIt1 t1 = beg1 + lane;
            InIt2 t2 = beg2 + lane;
            for (; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)
                *out = op(*t1, *t2);
...
...
@@ -100,7 +100,7 @@ namespace cv { namespace gpu { namespace device
        template<typename OutIt, typename T>
        static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
        {
            unsigned int lane = laneId();
            value += lane;

            for (OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)
...
...
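A usage sketch (hypothetical, single-warp launch): the warp-level STL analogues above stride by `STRIDE` (the warp size), so one warp can initialize buffers larger than itself. Each lane later reads back exactly the indices it wrote, so no synchronization is needed here.

__global__ void warpIota(int* out)
{
    __shared__ int seq[128];

    cv::gpu::device::Warp::yota(seq, seq + 128, 0); // seq[i] = i, written cooperatively by 32 lanes

    for (int i = cv::gpu::device::Warp::laneId(); i < 128; i += 32)
        out[i] = seq[i];
}
// launched as: warpIota<<<1, 32>>>(devPtr);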
modules/gpu/src/resize.cpp
View file @
e7f6c4b7
...
...
@@ -80,51 +80,9 @@ namespace cv { namespace gpu { namespace device
    template <typename T>
    void resize_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy,
                    DevMem2Db dst, int interpolation, cudaStream_t stream);

    template <typename T>
    void resize_area_gpu(const DevMem2Db src, DevMem2Db dst, float fx, float fy, int interpolation, DevMem2Df buffer, cudaStream_t stream);
}
}}}

void cv::gpu::resize(const GpuMat& src, GpuMat& dst, GpuMat& buffer, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
    CV_Assert(interpolation == INTER_AREA);
    CV_Assert((fx < 1.0) && (fy < 1.0));
    CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0));
    CV_Assert(src.cols >= 128 && src.rows >= 128);
    CV_Assert((fx - 128.0) <= 0 && (fy - 128.0) <= 0);

    if (dsize == Size())
        dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
    else
    {
        fx = static_cast<double>(dsize.width) / src.cols;
        fy = static_cast<double>(dsize.height) / src.rows;
    }

    fx = static_cast<float>(1.0 / fx);
    fy = static_cast<float>(1.0 / fy);

    dst.create(dsize, src.type());
    buffer.create(cv::Size(dsize.width, src.rows), CV_32FC1);

    if (dsize == src.size())
    {
        if (s)
            s.enqueueCopy(src, dst);
        else
            src.copyTo(dst);
        return;
    }

    cudaStream_t stream = StreamAccessor::getStream(s);
    cv::gpu::device::imgproc::resize_area_gpu<uchar>(src, dst, fx, fy, interpolation, buffer, stream);
}

void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
...
...
modules/gpu/test/test_resize.cpp
View file @
e7f6c4b7
...
...
@@ -182,45 +182,6 @@ PARAM_TEST_CASE(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, double, Inte
}
};
TEST_P(ResizeArea, Accuracy)
{
    cv::Mat src = randomMat(size, type);

    cv::gpu::GpuMat dst = createMat(cv::Size(cv::saturate_cast<int>(src.cols * coeff), cv::saturate_cast<int>(src.rows * coeff)), type, useRoi);
    cv::gpu::GpuMat buffer = createMat(cv::Size(dst.cols, src.rows), CV_32FC1);

    cv::gpu::resize(loadMat(src, useRoi), dst, buffer, cv::Size(), coeff, coeff, interpolation);

    cv::Mat dst_cpu;
    cv::resize(src, dst_cpu, cv::Size(), coeff, coeff, interpolation);

    cv::Mat gpu_buff;
    buffer.download(gpu_buff);

    cv::Mat gpu;
    dst.download(gpu);

    // std::cout // << src
    //           // << std::endl << std::endl
    //           // << gpu_buff
    //           // << std::endl << std::endl
    //           << gpu
    //           << std::endl << std::endl
    //           << dst_cpu << std::endl;

    EXPECT_MAT_NEAR(dst_cpu, dst, src.depth() == CV_32F ? 1e-2 : 1.0);
}

INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeArea, testing::Combine(
    ALL_DEVICES,
    testing::Values(cv::Size(640, 480)), //DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1) /*MatType(CV_8UC3), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/),
    testing::Values(0.05, 0.1),
    testing::Values(Interpolation(cv::INTER_AREA)),
    WHOLE_SUBMAT));
///////////////////////////////////////////////////////////////////
// Test NPP
...
...