Commit 5132ce21 authored Dec 17, 2010 by Alexey Spizhevoy
reimplemented gpu::bitwise_not operation, refactored gpu module
parent 1a93412e
Showing 2 changed files with 251 additions and 151 deletions

modules/gpu/src/cuda/mathfunc.cu (+239, -149)
tests/gpu/src/bitwise_oper.cpp (+12, -2)
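For orientation before the diffs: the user-facing entry point this commit reworks is the gpu::bitwise_not pair of overloads (with and without a mask), which route to the callers changed below. A minimal host-side usage sketch, assuming the OpenCV 2.x gpu-module API of this period (sizes and values are illustrative only):

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    using namespace cv;

    Mat src(480, 640, CV_8UC3, Scalar::all(7));   // host input image
    gpu::GpuMat d_src(src), d_dst;                // upload to the device

    gpu::GpuMat d_mask(src.size(), CV_8U);
    d_mask.setTo(Scalar(1));                      // non-zero mask value = process the pixel

    gpu::bitwise_not(d_src, d_dst);               // unmasked overload: word-wise kernel path
    gpu::bitwise_not(d_src, d_dst, d_mask);       // masked overload: per-element kernel path

    Mat dst(d_dst);                               // download the result back to the host
    return 0;
}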
modules/gpu/src/cuda/mathfunc.cu
...
...
@@ -59,7 +59,7 @@ using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace mathfunc
{
template <int size, typename T>
- __device__ void sum_in_smem(volatile T* data, const unsigned int tid)
+ __device__ void sum_in_smem(volatile T* data, const uint tid)
{
T sum = data[tid];
...
...
@@ -270,102 +270,183 @@ namespace cv { namespace gpu { namespace mathfunc
struct Mask8U
{
explicit Mask8U(PtrStep mask): mask(mask) {}
- __device__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; }
+ __device__ bool operator()(int y, int x) const
+ {
+     return mask.ptr(y)[x];
+ }
PtrStep mask;
};
- struct MaskTrue { __device__ bool operator()(int y, int x) const { return true; } };
+ struct MaskTrue
+ {
+     __device__ bool operator()(int y, int x) const
+     {
+         return true;
+     }
+ };
//------------------------------------------------------------------------
// Unary operations
- enum { UN_OP_NOT };
+ enum
+ {
+     UN_OP_NOT
+ };
template <typename T, int opid>
struct UnOp;
template <typename T>
struct UnOp<T, UN_OP_NOT>
{
- static __device__ T call(T x)
- {
-     return ~x;
- }
+ typedef typename TypeVec<T, 2>::vec_t Vec2;
+ typedef typename TypeVec<T, 3>::vec_t Vec3;
+ typedef typename TypeVec<T, 4>::vec_t Vec4;
+ static __device__ T call(T v) { return ~v; }
+ static __device__ Vec2 call(Vec2 v) { return VecTraits<Vec2>::make(~v.x, ~v.y); }
+ static __device__ Vec3 call(Vec3 v) { return VecTraits<Vec3>::make(~v.x, ~v.y, ~v.z); }
+ static __device__ Vec4 call(Vec4 v) { return VecTraits<Vec4>::make(~v.x, ~v.y, ~v.z, ~v.w); }
};
- template <typename T, int cn, typename UnOp, typename Mask>
- __global__ void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst, Mask mask)
- {
-     const int x = blockDim.x * blockIdx.x + threadIdx.x;
-     const int y = blockDim.y * blockIdx.y + threadIdx.y;
-     if (x < cols && y < rows && mask(y, x))
-     {
-         T* dsty = (T*)dst.ptr(y);
-         const T* srcy = (const T*)src.ptr(y);
-         #pragma unroll
-         for (int i = 0; i < cn; ++i)
-             dsty[cn * x + i] = UnOp::call(srcy[cn * x + i]);
-     }
- }
+ template <int opid>
+ __global__ void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst)
+ {
+     const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;
+     const int y = blockDim.y * blockIdx.y + threadIdx.y;
+     if (y < rows)
+     {
+         uchar* dst_ptr = dst.ptr(y) + x;
+         const uchar* src_ptr = src.ptr(y) + x;
+         if (x + sizeof(uint) - 1 < cols)
+         {
+             *(uint*)dst_ptr = UnOp<uint, opid>::call(*(uint*)src_ptr);
+         }
+         else
+         {
+             const uchar* src_end = src.ptr(y) + cols;
+             while (src_ptr < src_end)
+             {
+                 *dst_ptr++ = UnOp<uchar, opid>::call(*src_ptr++);
+             }
+         }
+     }
+ }
+
+ template <typename T, int cn, int opid>
+ __global__ void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst, const PtrStep mask)
+ {
+     typedef typename TypeVec<T, cn>::vec_t Type;
+     const int x = blockDim.x * blockIdx.x + threadIdx.x;
+     const int y = blockDim.y * blockIdx.y + threadIdx.y;
+     if (x < cols && y < rows && mask.ptr(y)[x])
+     {
+         Type* dst_row = (Type*)dst.ptr(y);
+         const Type* src_row = (const Type*)src.ptr(y);
+         dst_row[x] = UnOp<T, opid>::call(src_row[x]);
+     }
+ }
+
+ template <typename T, int cn, int opid>
+ __global__ void bitwise_un_op_two_loads(int rows, int cols, const PtrStep src, PtrStep dst, const PtrStep mask)
+ {
+     typedef typename TypeVec<T, cn>::vec_t Type;
+     const int x = blockDim.x * blockIdx.x + threadIdx.x;
+     const int y = blockDim.y * blockIdx.y + threadIdx.y;
+     if (x < cols && y < rows && mask.ptr(y)[x])
+     {
+         Type* dst_row = (Type*)dst.ptr(y);
+         const Type* src_row = (const Type*)src.ptr(y);
+         dst_row[2 * x] = UnOp<T, opid>::call(src_row[2 * x]);
+         dst_row[2 * x + 1] = UnOp<T, opid>::call(src_row[2 * x + 1]);
+     }
+ }
- template <int opid, typename Mask>
- void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst, int elem_size, Mask mask, cudaStream_t stream)
- {
-     dim3 threads(16, 16);
-     dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
-     switch (elem_size)
-     {
-     case 1:
-         bitwise_un_op<unsigned char, 1, UnOp<unsigned char, opid> ><<<grid, threads>>>(rows, cols, src, dst, mask);
-         break;
-     case 2:
-         bitwise_un_op<unsigned short, 1, UnOp<unsigned short, opid> ><<<grid, threads>>>(rows, cols, src, dst, mask);
-         break;
-     case 3:
-         bitwise_un_op<unsigned char, 3, UnOp<unsigned char, opid> ><<<grid, threads>>>(rows, cols, src, dst, mask);
-         break;
-     case 4:
-         bitwise_un_op<unsigned int, 1, UnOp<unsigned int, opid> ><<<grid, threads>>>(rows, cols, src, dst, mask);
-         break;
-     case 6:
-         bitwise_un_op<unsigned short, 3, UnOp<unsigned short, opid> ><<<grid, threads>>>(rows, cols, src, dst, mask);
-         break;
-     case 8:
-         bitwise_un_op<unsigned int, 2, UnOp<unsigned int, opid> ><<<grid, threads>>>(rows, cols, src, dst, mask);
-         break;
-     case 12:
-         bitwise_un_op<unsigned int, 3, UnOp<unsigned int, opid> ><<<grid, threads>>>(rows, cols, src, dst, mask);
-         break;
-     case 16:
-         bitwise_un_op<unsigned int, 4, UnOp<unsigned int, opid> ><<<grid, threads>>>(rows, cols, src, dst, mask);
-         break;
-     case 24:
-         bitwise_un_op<unsigned int, 6, UnOp<unsigned int, opid> ><<<grid, threads>>>(rows, cols, src, dst, mask);
-         break;
-     case 32:
-         bitwise_un_op<unsigned int, 8, UnOp<unsigned int, opid> ><<<grid, threads>>>(rows, cols, src, dst, mask);
-         break;
-     }
-     if (stream == 0) cudaSafeCall(cudaThreadSynchronize());
- }
+ template <int opid>
+ void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst, int elem_size, cudaStream_t stream)
+ {
+     dim3 threads(16, 16);
+     dim3 grid(divUp(cols * elem_size, threads.x * sizeof(uint)), divUp(rows, threads.y));
+     bitwise_un_op<opid><<<grid, threads>>>(rows, cols * elem_size, src, dst);
+     if (stream == 0) cudaSafeCall(cudaThreadSynchronize());
+ }
+
+ template <int opid>
+ void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst, int elem_size, const PtrStep mask, cudaStream_t stream)
+ {
+     dim3 threads(16, 16);
+     dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
+     switch (elem_size)
+     {
+     case 1:
+         bitwise_un_op<uchar, 1, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
+         break;
+     case 2:
+         bitwise_un_op<ushort, 1, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
+         break;
+     case 3:
+         bitwise_un_op<uchar, 3, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
+         break;
+     case 4:
+         bitwise_un_op<uint, 1, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
+         break;
+     case 6:
+         bitwise_un_op<ushort, 3, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
+         break;
+     case 8:
+         bitwise_un_op<uint, 2, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
+         break;
+     case 12:
+         bitwise_un_op<uint, 3, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
+         break;
+     case 16:
+         bitwise_un_op<uint, 4, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
+         break;
+     case 24:
+         bitwise_un_op_two_loads<uint, 3, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
+         break;
+     case 32:
+         bitwise_un_op_two_loads<uint, 4, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
+         break;
+     }
+     if (stream == 0) cudaSafeCall(cudaThreadSynchronize());
+ }
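A note on the unmasked launcher above: the image is treated as raw bytes (cols is pre-multiplied by elem_size), each thread flips one aligned 32-bit word, and only the trailing partial word of a row falls back to byte-wise work; for a 640-column CV_8UC3 image that is divUp(1920, 16 * 4) = 30 blocks across the row. The standalone CUDA sketch below illustrates just that access pattern; the names invert_row_bytes and width_bytes are illustrative and not part of the commit:

__global__ void invert_row_bytes(const unsigned char* src, unsigned char* dst, int width_bytes)
{
    // Each thread owns 4 consecutive bytes of the row.
    const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;

    if (x + 3 < width_bytes)
    {
        // A whole 32-bit word is available: one load, one bitwise NOT, one store.
        // (Assumes 4-byte alignment, which pitched GPU allocations provide.)
        *(unsigned int*)(dst + x) = ~*(const unsigned int*)(src + x);
    }
    else if (x < width_bytes)
    {
        // Trailing partial word of the row: finish byte by byte.
        for (int i = x; i < width_bytes; ++i)
            dst[i] = ~src[i];
    }
}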
- void bitwise_not_caller(int rows, int cols,const PtrStep src, int elem_size, PtrStep dst, cudaStream_t stream)
+ void bitwise_not_caller(int rows, int cols, const PtrStep src, int elem_size, PtrStep dst, cudaStream_t stream)
{
- bitwise_un_op<UN_OP_NOT>(rows, cols, src, dst, elem_size, MaskTrue(), stream);
+ bitwise_un_op<UN_OP_NOT>(rows, cols, src, dst, elem_size, stream);
}

void bitwise_not_caller(int rows, int cols,const PtrStep src, int elem_size, PtrStep dst, const PtrStep mask, cudaStream_t stream)
{
- bitwise_un_op<UN_OP_NOT>(rows, cols, src, dst, elem_size, Mask8U(mask), stream);
+ bitwise_un_op<UN_OP_NOT>(rows, cols, src, dst, elem_size, mask, stream);
}
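The masked dispatch keys on the element size in bytes, i.e. Mat::elemSize(), and picks an unsigned element type and channel count whose product matches it, with the *_two_loads kernel covering the 24- and 32-byte cases via two vector accesses. A quick host-side illustration of which bucket common types fall into (a hypothetical snippet, not from the commit):

#include <opencv2/core/core.hpp>
#include <cstdio>

int main()
{
    // elemSize() = channels * bytes per channel; it selects the switch case above.
    std::printf("CV_8UC1  -> %zu\n", cv::Mat(1, 1, CV_8UC1).elemSize());   // 1:  uchar,  cn = 1
    std::printf("CV_8UC3  -> %zu\n", cv::Mat(1, 1, CV_8UC3).elemSize());   // 3:  uchar,  cn = 3
    std::printf("CV_32SC4 -> %zu\n", cv::Mat(1, 1, CV_32SC4).elemSize());  // 16: uint,   cn = 4
    std::printf("CV_64FC4 -> %zu\n", cv::Mat(1, 1, CV_64FC4).elemSize());  // 32: two uint4 loads
    return 0;
}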
//------------------------------------------------------------------------
// Binary operations
enum { BIN_OP_OR, BIN_OP_AND, BIN_OP_XOR };
template <typename T, int opid>
struct BinOp;
template <typename T>
struct BinOp<T, BIN_OP_OR>
{
...
...
@@ -375,6 +456,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
};
template <typename T>
struct BinOp<T, BIN_OP_AND>
{
...
...
@@ -393,6 +475,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
};
template <typename T, int cn, typename BinOp, typename Mask>
__global__ void bitwise_bin_op(int rows, int cols, const PtrStep src1, const PtrStep src2, PtrStep dst, Mask mask)
{
...
...
@@ -411,6 +494,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
}
template <int opid, typename Mask>
void bitwise_bin_op(int rows, int cols, const PtrStep src1, const PtrStep src2, PtrStep dst, int elem_size, Mask mask, cudaStream_t stream)
{
...
...
@@ -419,64 +503,70 @@ namespace cv { namespace gpu { namespace mathfunc
switch (elem_size)
{
case 1:
-     bitwise_bin_op<unsigned char, 1, BinOp<unsigned char, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
+     bitwise_bin_op<uchar, 1, BinOp<uchar, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
    break;
case 2:
-     bitwise_bin_op<unsigned short, 1, BinOp<unsigned short, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
+     bitwise_bin_op<ushort, 1, BinOp<ushort, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
    break;
case 3:
-     bitwise_bin_op<unsigned char, 3, BinOp<unsigned char, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
+     bitwise_bin_op<uchar, 3, BinOp<uchar, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
    break;
case 4:
-     bitwise_bin_op<unsigned int, 1, BinOp<unsigned int, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
+     bitwise_bin_op<uint, 1, BinOp<uint, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
    break;
case 6:
-     bitwise_bin_op<unsigned short, 3, BinOp<unsigned short, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
+     bitwise_bin_op<ushort, 3, BinOp<ushort, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
    break;
case 8:
-     bitwise_bin_op<unsigned int, 2, BinOp<unsigned int, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
+     bitwise_bin_op<uint, 2, BinOp<uint, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
    break;
case 12:
-     bitwise_bin_op<unsigned int, 3, BinOp<unsigned int, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
+     bitwise_bin_op<uint, 3, BinOp<uint, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
    break;
case 16:
-     bitwise_bin_op<unsigned int, 4, BinOp<unsigned int, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
+     bitwise_bin_op<uint, 4, BinOp<uint, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
    break;
case 24:
-     bitwise_bin_op<unsigned int, 6, BinOp<unsigned int, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
+     bitwise_bin_op<uint, 6, BinOp<uint, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
    break;
case 32:
-     bitwise_bin_op<unsigned int, 8, BinOp<unsigned int, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
+     bitwise_bin_op<uint, 8, BinOp<uint, opid> ><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
    break;
}
if (stream == 0) cudaSafeCall(cudaThreadSynchronize());
}
void bitwise_or_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, cudaStream_t stream)
{
bitwise_bin_op<BIN_OP_OR>(rows, cols, src1, src2, dst, elem_size, MaskTrue(), stream);
}
void bitwise_or_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, const PtrStep mask, cudaStream_t stream)
{
bitwise_bin_op<BIN_OP_OR>(rows, cols, src1, src2, dst, elem_size, Mask8U(mask), stream);
}
void bitwise_and_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, cudaStream_t stream)
{
bitwise_bin_op<BIN_OP_AND>(rows, cols, src1, src2, dst, elem_size, MaskTrue(), stream);
}
void bitwise_and_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, const PtrStep mask, cudaStream_t stream)
{
bitwise_bin_op<BIN_OP_AND>(rows, cols, src1, src2, dst, elem_size, Mask8U(mask), stream);
}
void bitwise_xor_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, cudaStream_t stream)
{
bitwise_bin_op<BIN_OP_XOR>(rows, cols, src1, src2, dst, elem_size, MaskTrue(), stream);
}
void bitwise_xor_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, const PtrStep mask, cudaStream_t stream)
{
bitwise_bin_op<BIN_OP_XOR>(rows, cols, src1, src2, dst, elem_size, Mask8U(mask), stream);
...
...
@@ -490,9 +580,9 @@ namespace cv { namespace gpu { namespace mathfunc
// To avoid shared bank conflicts we convert each value into value of
// appropriate type (32 bits minimum)
template <typename T> struct MinMaxTypeTraits {};
- template <> struct MinMaxTypeTraits<unsigned char> { typedef int best_type; };
+ template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<char> { typedef int best_type; };
- template <> struct MinMaxTypeTraits<unsigned short> { typedef int best_type; };
+ template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
...
...
@@ -506,7 +596,7 @@ namespace cv { namespace gpu { namespace mathfunc
__constant__ int ctheight;
// Global counter of blocks finished its work
- __device__ unsigned int blocks_finished = 0;
+ __device__ uint blocks_finished = 0;
// Estimates good thread configuration
...
...
@@ -542,7 +632,7 @@ namespace cv { namespace gpu { namespace mathfunc
// Does min and max in shared memory
template <typename T>
- __device__ void merge(unsigned int tid, unsigned int offset, volatile T* minval, volatile T* maxval)
+ __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval)
{
minval[tid] = min(minval[tid], minval[tid + offset]);
maxval[tid] = max(maxval[tid], maxval[tid + offset]);
...
...
@@ -550,7 +640,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <int size, typename T>
- __device__ void find_min_max_in_smem(volatile T* minval, volatile T* maxval, const unsigned int tid)
+ __device__ void find_min_max_in_smem(volatile T* minval, volatile T* maxval, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); }
if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); }
...
...
@@ -575,18 +665,18 @@ namespace cv { namespace gpu { namespace mathfunc
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
- unsigned int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
- unsigned int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
- unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
+ uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
+ uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
+ uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits_gpu<T>::max();
T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
- unsigned int y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
- unsigned int x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
- for (unsigned int y = y0; y < y_end; y += blockDim.y)
+ uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
+ uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
+ for (uint y = y0; y < y_end; y += blockDim.y)
{
const T* src_row = (const T*)src.ptr(y);
- for (unsigned int x = x0; x < x_end; x += blockDim.x)
+ for (uint x = x0; x < x_end; x += blockDim.x)
{
T val = src_row[x];
if (mask(y, x))
...
...
@@ -618,7 +708,7 @@ namespace cv { namespace gpu { namespace mathfunc
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
__threadfence();
- unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
+ uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
...
...
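The __threadfence/atomicInc ticket in this hunk (repeated in the min/max-loc, count-non-zero and sum kernels further down) is the single-pass reduction finale: every block publishes its partial result, and the block that draws the last ticket folds all partials, so no second kernel launch is needed on devices that support it. A self-contained sketch of that pattern, with hypothetical names (reduce_sum, blocks_done) and a 256-thread, power-of-two block assumed:

__device__ unsigned int blocks_done = 0;  // zero-initialized global ticket counter

// Each block writes its partial sum into partials[blockIdx.x]; the last block
// to take a ticket reduces the partials into result[0]. Launch with 256 threads.
__global__ void reduce_sum(const float* data, int n, float* partials, float* result)
{
    __shared__ float smem[256];
    __shared__ bool is_last;

    float v = 0.f;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
        v += data[i];
    smem[threadIdx.x] = v;
    __syncthreads();

    for (int s = blockDim.x / 2; s > 0; s >>= 1)  // shared-memory tree reduction
    {
        if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
        __syncthreads();
    }

    if (threadIdx.x == 0)
    {
        partials[blockIdx.x] = smem[0];
        __threadfence();                                       // publish the partial result
        unsigned int ticket = atomicInc(&blocks_done, gridDim.x);
        is_last = (ticket == gridDim.x - 1);                   // true only in the final block
    }
    __syncthreads();

    if (is_last)
    {
        float total = 0.f;
        for (int i = threadIdx.x; i < gridDim.x; i += blockDim.x)
            total += partials[i];
        smem[threadIdx.x] = total;
        __syncthreads();
        for (int s = blockDim.x / 2; s > 0; s >>= 1)
        {
            if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
            __syncthreads();
        }
        if (threadIdx.x == 0) { result[0] = smem[0]; blocks_done = 0; }  // store and reset
    }
}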
@@ -626,7 +716,7 @@ namespace cv { namespace gpu { namespace mathfunc
if (is_last)
{
- unsigned int idx = min(tid, gridDim.x * gridDim.y - 1);
+ uint idx = min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
...
...
@@ -671,9 +761,9 @@ namespace cv { namespace gpu { namespace mathfunc
*maxval = maxval_;
}
- template void min_max_mask_caller<unsigned char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+ template void min_max_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
- template void min_max_mask_caller<unsigned short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+ template void min_max_mask_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
...
...
@@ -700,9 +790,9 @@ namespace cv { namespace gpu { namespace mathfunc
*maxval = maxval_;
}
- template void min_max_caller<unsigned char>(const DevMem2D, double*, double*, PtrStep);
+ template void min_max_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<char>(const DevMem2D, double*, double*, PtrStep);
- template void min_max_caller<unsigned short>(const DevMem2D, double*, double*, PtrStep);
+ template void min_max_caller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<short>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<int>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<float>(const DevMem2D, double*,double*, PtrStep);
...
...
@@ -716,8 +806,8 @@ namespace cv { namespace gpu { namespace mathfunc
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
- unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
- unsigned int idx = min(tid, gridDim.x * gridDim.y - 1);
+ uint tid = threadIdx.y * blockDim.x + threadIdx.x;
+ uint idx = min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
...
...
@@ -754,9 +844,9 @@ namespace cv { namespace gpu { namespace mathfunc
*maxval = maxval_;
}
- template void min_max_mask_multipass_caller<unsigned char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+ template void min_max_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
- template void min_max_mask_multipass_caller<unsigned short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+ template void min_max_mask_multipass_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
...
...
@@ -783,9 +873,9 @@ namespace cv { namespace gpu { namespace mathfunc
*maxval = maxval_;
}
- template void min_max_multipass_caller<unsigned char>(const DevMem2D, double*, double*, PtrStep);
+ template void min_max_multipass_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<char>(const DevMem2D, double*, double*, PtrStep);
- template void min_max_multipass_caller<unsigned short>(const DevMem2D, double*, double*, PtrStep);
+ template void min_max_multipass_caller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<short>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<int>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<float>(const DevMem2D, double*, double*, PtrStep);
...
...
@@ -801,7 +891,7 @@ namespace cv { namespace gpu { namespace mathfunc
__constant__ int ctheight;
// Global counter of blocks finished its work
- __device__ unsigned int blocks_finished = 0;
+ __device__ uint blocks_finished = 0;
// Estimates good thread configuration
...
...
@@ -839,8 +929,8 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
- __device__ void merge(unsigned int tid, unsigned int offset, volatile T* minval, volatile T* maxval,
-                       volatile unsigned int* minloc, volatile unsigned int* maxloc)
+ __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval,
+                       volatile uint* minloc, volatile uint* maxloc)
{
T val = minval[tid + offset];
if (val < minval[tid])
...
...
@@ -858,8 +948,8 @@ namespace cv { namespace gpu { namespace mathfunc
template <int size, typename T>
- __device__ void find_min_max_loc_in_smem(volatile T* minval, volatile T* maxval, volatile unsigned int* minloc,
-                                          volatile unsigned int* maxloc, const unsigned int tid)
+ __device__ void find_min_max_loc_in_smem(volatile T* minval, volatile T* maxval, volatile uint* minloc,
+                                          volatile uint* maxloc, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); }
...
...
@@ -879,29 +969,29 @@ namespace cv { namespace gpu { namespace mathfunc
template <int nthreads, typename T, typename Mask>
__global__ void min_max_loc_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval,
- unsigned int* minloc, unsigned int* maxloc)
+ uint* minloc, uint* maxloc)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
- __shared__ unsigned int sminloc[nthreads];
- __shared__ unsigned int smaxloc[nthreads];
+ __shared__ uint sminloc[nthreads];
+ __shared__ uint smaxloc[nthreads];
- unsigned int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
- unsigned int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
- unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
+ uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
+ uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
+ uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits_gpu<T>::max();
T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
- unsigned int myminloc = 0;
- unsigned int mymaxloc = 0;
- unsigned int y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
- unsigned int x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
+ uint myminloc = 0;
+ uint mymaxloc = 0;
+ uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
+ uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
- for (unsigned int y = y0; y < y_end; y += blockDim.y)
+ for (uint y = y0; y < y_end; y += blockDim.y)
{
const T* ptr = (const T*)src.ptr(y);
- for (unsigned int x = x0; x < x_end; x += blockDim.x)
+ for (uint x = x0; x < x_end; x += blockDim.x)
{
if (mask(y, x))
{
...
...
@@ -931,7 +1021,7 @@ namespace cv { namespace gpu { namespace mathfunc
maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
__threadfence();
- unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
+ uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
...
...
@@ -939,7 +1029,7 @@ namespace cv { namespace gpu { namespace mathfunc
if (is_last)
{
- unsigned int idx = min(tid, gridDim.x * gridDim.y - 1);
+ uint idx = min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
...
...
@@ -980,8 +1070,8 @@ namespace cv { namespace gpu { namespace mathfunc
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
- unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0);
- unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1);
+ uint* minloc_buf = (uint*)locbuf.ptr(0);
+ uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
...
...
@@ -992,16 +1082,16 @@ namespace cv { namespace gpu { namespace mathfunc
*minval = minval_;
*maxval = maxval_;
- unsigned int minloc_, maxloc_;
+ uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
- template void min_max_loc_mask_caller<unsigned char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+ template void min_max_loc_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
- template void min_max_loc_mask_caller<unsigned short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+ template void min_max_loc_mask_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
...
...
@@ -1018,8 +1108,8 @@ namespace cv { namespace gpu { namespace mathfunc
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
- unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0);
- unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1);
+ uint* minloc_buf = (uint*)locbuf.ptr(0);
+ uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
...
...
@@ -1030,16 +1120,16 @@ namespace cv { namespace gpu { namespace mathfunc
*minval = minval_;
*maxval = maxval_;
- unsigned int minloc_, maxloc_;
+ uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
- template void min_max_loc_caller<unsigned char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+ template void min_max_loc_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
- template void min_max_loc_caller<unsigned short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+ template void min_max_loc_caller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
...
...
@@ -1048,16 +1138,16 @@ namespace cv { namespace gpu { namespace mathfunc
// This kernel will be used only when compute capability is 1.0
template <int nthreads, typename T>
- __global__ void min_max_loc_pass2_kernel(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int size)
+ __global__ void min_max_loc_pass2_kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
- __shared__ unsigned int sminloc[nthreads];
- __shared__ unsigned int smaxloc[nthreads];
+ __shared__ uint sminloc[nthreads];
+ __shared__ uint smaxloc[nthreads];
- unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
- unsigned int idx = min(tid, gridDim.x * gridDim.y - 1);
+ uint tid = threadIdx.y * blockDim.x + threadIdx.x;
+ uint idx = min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
...
...
@@ -1087,8 +1177,8 @@ namespace cv { namespace gpu { namespace mathfunc
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
- unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0);
- unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1);
+ uint* minloc_buf = (uint*)locbuf.ptr(0);
+ uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
...
...
@@ -1100,16 +1190,16 @@ namespace cv { namespace gpu { namespace mathfunc
*minval = minval_;
*maxval = maxval_;
- unsigned int minloc_, maxloc_;
+ uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
- template void min_max_loc_mask_multipass_caller<unsigned char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+ template void min_max_loc_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
- template void min_max_loc_mask_multipass_caller<unsigned short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+ template void min_max_loc_mask_multipass_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
...
...
@@ -1125,8 +1215,8 @@ namespace cv { namespace gpu { namespace mathfunc
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
- unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0);
- unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1);
+ uint* minloc_buf = (uint*)locbuf.ptr(0);
+ uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
...
...
@@ -1138,16 +1228,16 @@ namespace cv { namespace gpu { namespace mathfunc
*minval = minval_;
*maxval = maxval_;
- unsigned int minloc_, maxloc_;
+ uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
- template void min_max_loc_multipass_caller<unsigned char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+ template void min_max_loc_multipass_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
- template void min_max_loc_multipass_caller<unsigned short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+ template void min_max_loc_multipass_caller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
...
...
@@ -1163,7 +1253,7 @@ namespace cv { namespace gpu { namespace mathfunc
__constant__ int ctwidth;
__constant__ int ctheight;
- __device__ unsigned int blocks_finished = 0;
+ __device__ uint blocks_finished = 0;
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
...
...
@@ -1193,26 +1283,26 @@ namespace cv { namespace gpu { namespace mathfunc
template <int nthreads, typename T>
- __global__ void count_non_zero_kernel(const DevMem2D src, volatile unsigned int* count)
+ __global__ void count_non_zero_kernel(const DevMem2D src, volatile uint* count)
{
- __shared__ unsigned int scount[nthreads];
+ __shared__ uint scount[nthreads];
- unsigned int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
- unsigned int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
- unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
+ uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
+ uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
+ uint tid = threadIdx.y * blockDim.x + threadIdx.x;
- unsigned int cnt = 0;
- for (unsigned int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
+ uint cnt = 0;
+ for (uint y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
- for (unsigned int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
+ for (uint x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
cnt += ptr[x0 + x * blockDim.x] != 0;
}
scount[tid] = cnt;
__syncthreads();
- sum_in_smem<nthreads, unsigned int>(scount, tid);
+ sum_in_smem<nthreads, uint>(scount, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
...
...
@@ -1222,7 +1312,7 @@ namespace cv { namespace gpu { namespace mathfunc
count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
__threadfence();
- unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
+ uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
...
...
@@ -1233,7 +1323,7 @@ namespace cv { namespace gpu { namespace mathfunc
scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
__syncthreads();
- sum_in_smem<nthreads, unsigned int>(scount, tid);
+ sum_in_smem<nthreads, uint>(scount, tid);
if (tid == 0)
{
...
...
@@ -1254,20 +1344,20 @@ namespace cv { namespace gpu { namespace mathfunc
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
- unsigned int* count_buf = (unsigned int*)buf.ptr(0);
+ uint* count_buf = (uint*)buf.ptr(0);
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall(cudaThreadSynchronize());
- unsigned int count;
+ uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
return count;
}
- template int count_non_zero_caller<unsigned char>(const DevMem2D, PtrStep);
+ template int count_non_zero_caller<uchar>(const DevMem2D, PtrStep);
template int count_non_zero_caller<char>(const DevMem2D, PtrStep);
- template int count_non_zero_caller<unsigned short>(const DevMem2D, PtrStep);
+ template int count_non_zero_caller<ushort>(const DevMem2D, PtrStep);
template int count_non_zero_caller<short>(const DevMem2D, PtrStep);
template int count_non_zero_caller<int>(const DevMem2D, PtrStep);
template int count_non_zero_caller<float>(const DevMem2D, PtrStep);
...
...
@@ -1275,15 +1365,15 @@ namespace cv { namespace gpu { namespace mathfunc
template <int nthreads, typename T>
- __global__ void count_non_zero_pass2_kernel(unsigned int* count, int size)
+ __global__ void count_non_zero_pass2_kernel(uint* count, int size)
{
- __shared__ unsigned int scount[nthreads];
- unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
+ __shared__ uint scount[nthreads];
+ uint tid = threadIdx.y * blockDim.x + threadIdx.x;
scount[tid] = tid < size ? count[tid] : 0;
__syncthreads();
- sum_in_smem<nthreads, unsigned int>(scount, tid);
+ sum_in_smem<nthreads, uint>(scount, tid);
if (tid == 0)
count[0] = scount[0];
...
...
@@ -1297,21 +1387,21 @@ namespace cv { namespace gpu { namespace mathfunc
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
- unsigned int* count_buf = (unsigned int*)buf.ptr(0);
+ uint* count_buf = (uint*)buf.ptr(0);
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
count_non_zero_pass2_kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
- unsigned int count;
+ uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
return count;
}
- template int count_non_zero_multipass_caller<unsigned char>(const DevMem2D, PtrStep);
+ template int count_non_zero_multipass_caller<uchar>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<char>(const DevMem2D, PtrStep);
- template int count_non_zero_multipass_caller<unsigned short>(const DevMem2D, PtrStep);
+ template int count_non_zero_multipass_caller<ushort>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<short>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<int>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<float>(const DevMem2D, PtrStep);
...
...
@@ -1485,9 +1575,9 @@ namespace cv { namespace gpu { namespace mathfunc
{
template <typename T> struct SumType {};
- template <> struct SumType<unsigned char> { typedef unsigned int R; };
+ template <> struct SumType<uchar> { typedef uint R; };
template <> struct SumType<char> { typedef int R; };
- template <> struct SumType<unsigned short> { typedef unsigned int R; };
+ template <> struct SumType<ushort> { typedef uint R; };
template <> struct SumType<short> { typedef int R; };
template <> struct SumType<int> { typedef int R; };
template <> struct SumType<float> { typedef float R; };
...
...
@@ -1501,7 +1591,7 @@ namespace cv { namespace gpu { namespace mathfunc
__constant__ int ctwidth;
__constant__ int ctheight;
- __device__ unsigned int blocks_finished = 0;
+ __device__ uint blocks_finished = 0;
const int threads_x = 32;
const int threads_y = 8;
...
...
@@ -1564,7 +1654,7 @@ namespace cv { namespace gpu { namespace mathfunc
result[bid] = smem[0];
__threadfence();
- unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
+ uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
...
...
@@ -1648,7 +1738,7 @@ namespace cv { namespace gpu { namespace mathfunc
result[bid] = res;
__threadfence();
- unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
+ uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
...
...
@@ -1756,7 +1846,7 @@ namespace cv { namespace gpu { namespace mathfunc
result[bid] = res;
__threadfence();
- unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
+ uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
...
...
@@ -1874,7 +1964,7 @@ namespace cv { namespace gpu { namespace mathfunc
result[bid] = res;
__threadfence();
- unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
+ uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
...
...
@@ -1996,9 +2086,9 @@ namespace cv { namespace gpu { namespace mathfunc
sum[3] = result[3];
}
- template void sum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*, int);
+ template void sum_multipass_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);
- template void sum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*, int);
+ template void sum_multipass_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);
...
...
@@ -2044,9 +2134,9 @@ namespace cv { namespace gpu { namespace mathfunc
sum[3] = result[3];
}
- template void sum_caller<unsigned char>(const DevMem2D, PtrStep, double*, int);
+ template void sum_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<char>(const DevMem2D, PtrStep, double*, int);
- template void sum_caller<unsigned short>(const DevMem2D, PtrStep, double*, int);
+ template void sum_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<float>(const DevMem2D, PtrStep, double*, int);
...
...
@@ -2100,9 +2190,9 @@ namespace cv { namespace gpu { namespace mathfunc
sum[3] = result[3];
}
- template void sqsum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*, int);
+ template void sqsum_multipass_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);
- template void sqsum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*, int);
+ template void sqsum_multipass_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);
...
...
@@ -2148,9 +2238,9 @@ namespace cv { namespace gpu { namespace mathfunc
sum[3] = result[3];
}
- template void sqsum_caller<unsigned char>(const DevMem2D, PtrStep, double*, int);
+ template void sqsum_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<char>(const DevMem2D, PtrStep, double*, int);
- template void sqsum_caller<unsigned short>(const DevMem2D, PtrStep, double*, int);
+ template void sqsum_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<float>(const DevMem2D, PtrStep, double*, int);
...
...
tests/gpu/src/bitwise_oper.cpp
...
...
@@ -44,7 +44,7 @@
#include "gputest.hpp"
#define CHECK(pred, err) if (!(pred)) { \
- ts->printf(CvTS::LOG, "Fail: \"%s\" at line: %d\n", #pred, __LINE__); \
+ ts->printf(CvTS::CONSOLE, "Fail: \"%s\" at line: %d\n", #pred, __LINE__); \
ts->set_failed_test_info(err); \
return; }
...
...
@@ -88,12 +88,22 @@ struct CV_GpuBitwiseTest: public CvTest
mask.setTo(Scalar(1));

gpu::GpuMat dst;
- gpu::bitwise_not(gpu::GpuMat(src), dst, mask);
+ gpu::bitwise_not(gpu::GpuMat(src), dst);

CHECK(dst_gold.size() == dst.size(), CvTS::FAIL_INVALID_OUTPUT);
CHECK(dst_gold.type() == dst.type(), CvTS::FAIL_INVALID_OUTPUT);

Mat dsth(dst);
for (int i = 0; i < dst_gold.rows; ++i)
    CHECK(memcmp(dst_gold.ptr(i), dsth.ptr(i), dst_gold.cols * dst_gold.elemSize()) == 0, CvTS::FAIL_INVALID_OUTPUT);

+ dst.setTo(Scalar::all(0));
+ gpu::bitwise_not(gpu::GpuMat(src), dst, mask);
+
+ CHECK(dst_gold.size() == dst.size(), CvTS::FAIL_INVALID_OUTPUT);
+ CHECK(dst_gold.type() == dst.type(), CvTS::FAIL_INVALID_OUTPUT);
+
+ dsth = dst;
+ for (int i = 0; i < dst_gold.rows; ++i)
+     CHECK(memcmp(dst_gold.ptr(i), dsth.ptr(i), dst_gold.cols * dst_gold.elemSize()) == 0, CvTS::FAIL_INVALID_OUTPUT)
}
...
...