Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
e62bf3a2
Commit
e62bf3a2
authored
Dec 20, 2010
by
Alexey Spizhevoy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
updated gpu bitwise operations
parent
0f30fe08
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
182 additions
and
207 deletions
+182
-207
arithm.cpp
modules/gpu/src/arithm.cpp
+95
-25
mathfunc.cu
modules/gpu/src/cuda/mathfunc.cu
+87
-182
No files found.
modules/gpu/src/arithm.cpp
View file @
e62bf3a2
...
...
@@ -1002,18 +1002,25 @@ void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat&
namespace
cv
{
namespace
gpu
{
namespace
mathfunc
{
void
bitwise_not_caller
(
int
rows
,
int
cols
,
const
PtrStep
src
,
int
elemSize
,
PtrStep
dst
,
cudaStream_t
stream
);
void
bitwise_not_caller
(
int
rows
,
int
cols
,
const
PtrStep
src
,
int
elemSize
,
PtrStep
dst
,
const
PtrStep
mask
,
cudaStream_t
stream
);
void
bitwise_or_caller
(
int
rows
,
int
cols
,
const
PtrStep
src1
,
const
PtrStep
src2
,
int
elemSize
,
PtrStep
dst
,
cudaStream_t
stream
);
void
bitwise_or_caller
(
int
rows
,
int
cols
,
const
PtrStep
src1
,
const
PtrStep
src2
,
int
elemSize
,
PtrStep
dst
,
const
PtrStep
mask
,
cudaStream_t
stream
);
void
bitwise_and_caller
(
int
rows
,
int
cols
,
const
PtrStep
src1
,
const
PtrStep
src2
,
int
elemSize
,
PtrStep
dst
,
cudaStream_t
stream
);
void
bitwise_and_caller
(
int
rows
,
int
cols
,
const
PtrStep
src1
,
const
PtrStep
src2
,
int
elemSize
,
PtrStep
dst
,
const
PtrStep
mask
,
cudaStream_t
stream
);
void
bitwise_xor_caller
(
int
rows
,
int
cols
,
const
PtrStep
src1
,
const
PtrStep
src2
,
int
elemSize
,
PtrStep
dst
,
cudaStream_t
stream
);
void
bitwise_xor_caller
(
int
rows
,
int
cols
,
const
PtrStep
src1
,
const
PtrStep
src2
,
int
elemSize
,
PtrStep
dst
,
const
PtrStep
mask
,
cudaStream_t
stream
);
template
<
int
opid
,
typename
Mask
>
void
bitwise_bin_op
(
int
rows
,
int
cols
,
const
PtrStep
src1
,
const
PtrStep
src2
,
PtrStep
dst
,
int
elem_size
,
Mask
mask
,
cudaStream_t
stream
);
void
bitwise_not_caller
(
int
rows
,
int
cols
,
int
elem_size1
,
int
cn
,
const
PtrStep
src
,
PtrStep
dst
,
cudaStream_t
stream
);
template
<
typename
T
>
void
bitwise_mask_not_caller
(
int
rows
,
int
cols
,
int
cn
,
const
PtrStep
src
,
const
PtrStep
mask
,
PtrStep
dst
,
cudaStream_t
stream
);
void
bitwise_or_caller
(
int
rows
,
int
cols
,
int
elem_size1
,
int
cn
,
const
PtrStep
src1
,
const
PtrStep
src2
,
PtrStep
dst
,
cudaStream_t
stream
);
template
<
typename
T
>
void
bitwise_mask_or_caller
(
int
rows
,
int
cols
,
int
cn
,
const
PtrStep
src1
,
const
PtrStep
src2
,
const
PtrStep
mask
,
PtrStep
dst
,
cudaStream_t
stream
);
void
bitwise_and_caller
(
int
rows
,
int
cols
,
int
elem_size1
,
int
cn
,
const
PtrStep
src1
,
const
PtrStep
src2
,
PtrStep
dst
,
cudaStream_t
stream
);
template
<
typename
T
>
void
bitwise_mask_and_caller
(
int
rows
,
int
cols
,
int
cn
,
const
PtrStep
src1
,
const
PtrStep
src2
,
const
PtrStep
mask
,
PtrStep
dst
,
cudaStream_t
stream
);
void
bitwise_xor_caller
(
int
rows
,
int
cols
,
int
elem_size1
,
int
cn
,
const
PtrStep
src1
,
const
PtrStep
src2
,
PtrStep
dst
,
cudaStream_t
stream
);
template
<
typename
T
>
void
bitwise_mask_xor_caller
(
int
rows
,
int
cols
,
int
cn
,
const
PtrStep
src1
,
const
PtrStep
src2
,
const
PtrStep
mask
,
PtrStep
dst
,
cudaStream_t
stream
);
}}}
namespace
...
...
@@ -1021,60 +1028,123 @@ namespace
void
bitwise_not_caller
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
cudaStream_t
stream
)
{
dst
.
create
(
src
.
size
(),
src
.
type
());
mathfunc
::
bitwise_not_caller
(
src
.
rows
,
src
.
cols
,
src
,
src
.
elemSize
(),
dst
,
stream
);
cv
::
gpu
::
mathfunc
::
bitwise_not_caller
(
src
.
rows
,
src
.
cols
,
src
.
elemSize1
(),
dst
.
channels
(),
src
,
dst
,
stream
);
}
void
bitwise_not_caller
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
const
GpuMat
&
mask
,
cudaStream_t
stream
)
{
using
namespace
cv
::
gpu
;
typedef
void
(
*
Caller
)(
int
,
int
,
int
,
const
PtrStep
,
const
PtrStep
,
PtrStep
,
cudaStream_t
);
static
Caller
callers
[]
=
{
mathfunc
::
bitwise_mask_not_caller
<
unsigned
char
>
,
mathfunc
::
bitwise_mask_not_caller
<
unsigned
char
>
,
mathfunc
::
bitwise_mask_not_caller
<
unsigned
short
>
,
mathfunc
::
bitwise_mask_not_caller
<
unsigned
short
>
,
mathfunc
::
bitwise_mask_not_caller
<
unsigned
int
>
,
mathfunc
::
bitwise_mask_not_caller
<
unsigned
int
>
,
mathfunc
::
bitwise_mask_not_caller
<
unsigned
int
>
};
CV_Assert
(
mask
.
type
()
==
CV_8U
&&
mask
.
size
()
==
src
.
size
());
dst
.
create
(
src
.
size
(),
src
.
type
());
mathfunc
::
bitwise_not_caller
(
src
.
rows
,
src
.
cols
,
src
,
src
.
elemSize
(),
dst
,
mask
,
stream
);
Caller
caller
=
callers
[
src
.
depth
()];
CV_Assert
(
caller
);
int
cn
=
src
.
depth
()
!=
CV_64F
?
src
.
channels
()
:
src
.
channels
()
*
(
sizeof
(
double
)
/
sizeof
(
unsigned
int
));
caller
(
src
.
rows
,
src
.
cols
,
cn
,
src
,
mask
,
dst
,
stream
);
}
void
bitwise_or_caller
(
const
GpuMat
&
src1
,
const
GpuMat
&
src2
,
GpuMat
&
dst
,
cudaStream_t
stream
)
{
CV_Assert
(
src1
.
size
()
==
src2
.
size
()
&&
src1
.
type
()
==
src2
.
type
());
dst
.
create
(
src1
.
size
(),
src1
.
type
());
mathfunc
::
bitwise_or_caller
(
dst
.
rows
,
dst
.
cols
,
src1
,
src2
,
dst
.
elemSize
(),
dst
,
stream
);
cv
::
gpu
::
mathfunc
::
bitwise_or_caller
(
dst
.
rows
,
dst
.
cols
,
dst
.
elemSize1
(),
dst
.
channels
(),
src1
,
src2
,
dst
,
stream
);
}
void
bitwise_or_caller
(
const
GpuMat
&
src1
,
const
GpuMat
&
src2
,
GpuMat
&
dst
,
const
GpuMat
&
mask
,
cudaStream_t
stream
)
{
using
namespace
cv
::
gpu
;
typedef
void
(
*
Caller
)(
int
,
int
,
int
,
const
PtrStep
,
const
PtrStep
,
const
PtrStep
,
PtrStep
,
cudaStream_t
);
static
Caller
callers
[]
=
{
mathfunc
::
bitwise_mask_or_caller
<
unsigned
char
>
,
mathfunc
::
bitwise_mask_or_caller
<
unsigned
char
>
,
mathfunc
::
bitwise_mask_or_caller
<
unsigned
short
>
,
mathfunc
::
bitwise_mask_or_caller
<
unsigned
short
>
,
mathfunc
::
bitwise_mask_or_caller
<
unsigned
int
>
,
mathfunc
::
bitwise_mask_or_caller
<
unsigned
int
>
,
mathfunc
::
bitwise_mask_or_caller
<
unsigned
int
>
};
CV_Assert
(
src1
.
size
()
==
src2
.
size
()
&&
src1
.
type
()
==
src2
.
type
());
CV_Assert
(
mask
.
type
()
==
CV_8U
&&
mask
.
size
()
==
src1
.
size
());
dst
.
create
(
src1
.
size
(),
src1
.
type
());
mathfunc
::
bitwise_or_caller
(
dst
.
rows
,
dst
.
cols
,
src1
,
src2
,
dst
.
elemSize
(),
dst
,
mask
,
stream
);
Caller
caller
=
callers
[
src1
.
depth
()];
CV_Assert
(
caller
);
int
cn
=
dst
.
depth
()
!=
CV_64F
?
dst
.
channels
()
:
dst
.
channels
()
*
(
sizeof
(
double
)
/
sizeof
(
unsigned
int
));
caller
(
dst
.
rows
,
dst
.
cols
,
cn
,
src1
,
src2
,
mask
,
dst
,
stream
);
}
void
bitwise_and_caller
(
const
GpuMat
&
src1
,
const
GpuMat
&
src2
,
GpuMat
&
dst
,
cudaStream_t
stream
)
{
CV_Assert
(
src1
.
size
()
==
src2
.
size
()
&&
src1
.
type
()
==
src2
.
type
());
dst
.
create
(
src1
.
size
(),
src1
.
type
());
mathfunc
::
bitwise_and_caller
(
dst
.
rows
,
dst
.
cols
,
src1
,
src2
,
dst
.
elemSize
(),
dst
,
stream
);
cv
::
gpu
::
mathfunc
::
bitwise_and_caller
(
dst
.
rows
,
dst
.
cols
,
dst
.
elemSize1
(),
dst
.
channels
(),
src1
,
src2
,
dst
,
stream
);
}
void
bitwise_and_caller
(
const
GpuMat
&
src1
,
const
GpuMat
&
src2
,
GpuMat
&
dst
,
const
GpuMat
&
mask
,
cudaStream_t
stream
)
{
using
namespace
cv
::
gpu
;
typedef
void
(
*
Caller
)(
int
,
int
,
int
,
const
PtrStep
,
const
PtrStep
,
const
PtrStep
,
PtrStep
,
cudaStream_t
);
static
Caller
callers
[]
=
{
mathfunc
::
bitwise_mask_and_caller
<
unsigned
char
>
,
mathfunc
::
bitwise_mask_and_caller
<
unsigned
char
>
,
mathfunc
::
bitwise_mask_and_caller
<
unsigned
short
>
,
mathfunc
::
bitwise_mask_and_caller
<
unsigned
short
>
,
mathfunc
::
bitwise_mask_and_caller
<
unsigned
int
>
,
mathfunc
::
bitwise_mask_and_caller
<
unsigned
int
>
,
mathfunc
::
bitwise_mask_and_caller
<
unsigned
int
>
};
CV_Assert
(
src1
.
size
()
==
src2
.
size
()
&&
src1
.
type
()
==
src2
.
type
());
CV_Assert
(
mask
.
type
()
==
CV_8U
&&
mask
.
size
()
==
src1
.
size
());
dst
.
create
(
src1
.
size
(),
src1
.
type
());
mathfunc
::
bitwise_and_caller
(
dst
.
rows
,
dst
.
cols
,
src1
,
src2
,
dst
.
elemSize
(),
dst
,
mask
,
stream
);
Caller
caller
=
callers
[
src1
.
depth
()];
CV_Assert
(
caller
);
int
cn
=
dst
.
depth
()
!=
CV_64F
?
dst
.
channels
()
:
dst
.
channels
()
*
(
sizeof
(
double
)
/
sizeof
(
unsigned
int
));
caller
(
dst
.
rows
,
dst
.
cols
,
cn
,
src1
,
src2
,
mask
,
dst
,
stream
);
}
void
bitwise_xor_caller
(
const
GpuMat
&
src1
,
const
GpuMat
&
src2
,
GpuMat
&
dst
,
cudaStream_t
stream
)
{
CV_Assert
(
src1
.
size
()
==
src2
.
size
());
CV_Assert
(
src1
.
type
()
==
src2
.
type
());
CV_Assert
(
src1
.
size
()
==
src2
.
size
()
&&
src1
.
type
()
==
src2
.
type
());
dst
.
create
(
src1
.
size
(),
src1
.
type
());
mathfunc
::
bitwise_xor_caller
(
dst
.
rows
,
dst
.
cols
,
src1
,
src2
,
dst
.
elemSize
(),
dst
,
stream
);
cv
::
gpu
::
mathfunc
::
bitwise_xor_caller
(
dst
.
rows
,
dst
.
cols
,
dst
.
elemSize1
(),
dst
.
channels
(),
src1
,
src2
,
dst
,
stream
);
}
void
bitwise_xor_caller
(
const
GpuMat
&
src1
,
const
GpuMat
&
src2
,
GpuMat
&
dst
,
const
GpuMat
&
mask
,
cudaStream_t
stream
)
{
using
namespace
cv
::
gpu
;
typedef
void
(
*
Caller
)(
int
,
int
,
int
,
const
PtrStep
,
const
PtrStep
,
const
PtrStep
,
PtrStep
,
cudaStream_t
);
static
Caller
callers
[]
=
{
mathfunc
::
bitwise_mask_xor_caller
<
unsigned
char
>
,
mathfunc
::
bitwise_mask_xor_caller
<
unsigned
char
>
,
mathfunc
::
bitwise_mask_xor_caller
<
unsigned
short
>
,
mathfunc
::
bitwise_mask_xor_caller
<
unsigned
short
>
,
mathfunc
::
bitwise_mask_xor_caller
<
unsigned
int
>
,
mathfunc
::
bitwise_mask_xor_caller
<
unsigned
int
>
,
mathfunc
::
bitwise_mask_xor_caller
<
unsigned
int
>
};
CV_Assert
(
src1
.
size
()
==
src2
.
size
()
&&
src1
.
type
()
==
src2
.
type
());
CV_Assert
(
mask
.
type
()
==
CV_8U
&&
mask
.
size
()
==
src1
.
size
());
dst
.
create
(
src1
.
size
(),
src1
.
type
());
mathfunc
::
bitwise_xor_caller
(
dst
.
rows
,
dst
.
cols
,
src1
,
src2
,
dst
.
elemSize
(),
dst
,
mask
,
stream
);
Caller
caller
=
callers
[
src1
.
depth
()];
CV_Assert
(
caller
);
int
cn
=
dst
.
depth
()
!=
CV_64F
?
dst
.
channels
()
:
dst
.
channels
()
*
(
sizeof
(
double
)
/
sizeof
(
unsigned
int
));
caller
(
dst
.
rows
,
dst
.
cols
,
cn
,
src1
,
src2
,
mask
,
dst
,
stream
);
}
}
...
...
modules/gpu/src/cuda/mathfunc.cu
View file @
e62bf3a2
...
...
@@ -299,18 +299,12 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
struct UnOp<T, UN_OP_NOT>
{
typedef typename TypeVec<T, 2>::vec_t Vec2;
typedef typename TypeVec<T, 3>::vec_t Vec3;
typedef typename TypeVec<T, 4>::vec_t Vec4;
static __device__ T call(T v) { return ~v; }
static __device__ Vec2 call(Vec2 v) { return VecTraits<Vec2>::make(~v.x, ~v.y); }
static __device__ Vec3 call(Vec3 v) { return VecTraits<Vec3>::make(~v.x, ~v.y, ~v.z); }
static __device__ Vec4 call(Vec4 v) { return VecTraits<Vec4>::make(~v.x, ~v.y, ~v.z, ~v.w); }
};
template <int opid>
__global__ void bitwise_un_op
(int rows, int cols
, const PtrStep src, PtrStep dst)
__global__ void bitwise_un_op
_kernel(int rows, int width
, const PtrStep src, PtrStep dst)
{
const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
...
...
@@ -319,13 +313,13 @@ namespace cv { namespace gpu { namespace mathfunc
{
uchar* dst_ptr = dst.ptr(y) + x;
const uchar* src_ptr = src.ptr(y) + x;
if (x + sizeof(uint) - 1 <
cols
)
if (x + sizeof(uint) - 1 <
width
)
{
*(uint*)dst_ptr = UnOp<uint, opid>::call(*(uint*)src_ptr);
}
else
{
const uchar* src_end = src.ptr(y) +
cols
;
const uchar* src_end = src.ptr(y) +
width
;
while (src_ptr < src_end)
{
*dst_ptr++ = UnOp<uchar, opid>::call(*src_ptr++);
...
...
@@ -335,105 +329,65 @@ namespace cv { namespace gpu { namespace mathfunc
}
template <
typename T, int cn,
int opid>
__global__ void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst, const PtrStep mask
)
template <int opid>
void bitwise_un_op(int rows, int width, const PtrStep src, PtrStep dst, cudaStream_t stream
)
{
typedef typename TypeVec<T, cn>::vec_t Type
;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y
;
dim3 threads(16, 16)
;
dim3 grid(divUp(width, threads.x * sizeof(uint)),
divUp(rows, threads.y))
;
if (x < cols && y < rows && mask.ptr(y)[x])
{
Type* dst_row = (Type*)dst.ptr(y);
const Type* src_row = (const Type*)src.ptr(y);
dst_row[x] = UnOp<T, opid>::call(src_row[x]);
}
bitwise_un_op_kernel<opid><<<grid, threads>>>(rows, width, src, dst);
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
template <typename T, int
cn, int
opid>
__global__ void bitwise_un_op_
two_loads(int rows, int cols, const PtrStep src, PtrStep dst, const PtrStep mask
)
template <typename T, int opid>
__global__ void bitwise_un_op_
kernel(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst
)
{
typedef typename TypeVec<T, cn>::vec_t Type;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < cols && y < rows && mask.ptr(y)[x])
if (x < cols && y < rows && mask.ptr(y)[x
/ cn
])
{
T
ype* dst_row = (Type
*)dst.ptr(y);
const T
ype* src_row = (const Type
*)src.ptr(y);
dst_row[2 * x] = UnOp<T, opid>::call(src_row[2 * x]);
dst_row[
2 * x + 1] = UnOp<T, opid>::call(src_row[2 * x + 1
]);
T
* dst_row = (T
*)dst.ptr(y);
const T
* src_row = (const T
*)src.ptr(y);
dst_row[
x] = UnOp<T, opid>::call(src_row[x
]);
}
}
template <int opid>
void bitwise_un_op(int rows, int cols,
const PtrStep src, PtrStep dst, int elem_size
, cudaStream_t stream)
template <
typename T,
int opid>
void bitwise_un_op(int rows, int cols,
int cn, const PtrStep src, const PtrStep mask, PtrStep dst
, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols * elem_size, threads.x * sizeof(uint)),
divUp(rows, threads.y));
bitwise_un_op<opid><<<grid, threads>>>(rows, cols * elem_size, src, dst);
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
bitwise_un_op_kernel<T, opid><<<grid, threads>>>(rows, cols, cn, src, mask, dst);
template <int opid>
void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst, int elem_size, const PtrStep mask, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
switch (elem_size)
{
case 1:
bitwise_un_op<uchar, 1, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
break;
case 2:
bitwise_un_op<ushort, 1, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
break;
case 3:
bitwise_un_op<uchar, 3, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
break;
case 4:
bitwise_un_op<uint, 1, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
break;
case 6:
bitwise_un_op<ushort, 3, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
break;
case 8:
bitwise_un_op<uint, 2, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
break;
case 12:
bitwise_un_op<uint, 3, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
break;
case 16:
bitwise_un_op<uint, 4, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
break;
case 24:
bitwise_un_op_two_loads<uint, 3, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
break;
case 32:
bitwise_un_op_two_loads<uint, 4, opid><<<grid, threads>>>(rows, cols, src, dst, mask);
break;
}
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
void bitwise_not_caller(int rows, int cols,
const PtrStep src, int elem_size
, PtrStep dst, cudaStream_t stream)
void bitwise_not_caller(int rows, int cols,
int elem_size1, int cn, const PtrStep src
, PtrStep dst, cudaStream_t stream)
{
bitwise_un_op<UN_OP_NOT>(rows, cols
, src, dst, elem_size
, stream);
bitwise_un_op<UN_OP_NOT>(rows, cols
* elem_size1 * cn, src, dst
, stream);
}
void bitwise_not_caller(int rows, int cols,const PtrStep src, int elem_size, PtrStep dst, const PtrStep mask, cudaStream_t stream)
template <typename T>
void bitwise_mask_not_caller(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst, cudaStream_t stream)
{
bitwise_un_op<
UN_OP_NOT>(rows, cols, src, dst, elem_size, mask
, stream);
bitwise_un_op<
T, UN_OP_NOT>(rows, cols * cn, cn, src, mask, dst
, stream);
}
template void bitwise_mask_not_caller<uchar>(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
template void bitwise_mask_not_caller<ushort>(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
template void bitwise_mask_not_caller<uint>(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
//------------------------------------------------------------------------
// Binary operations
...
...
@@ -445,43 +399,25 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
struct BinOp<T, BIN_OP_OR>
{
typedef typename TypeVec<T, 2>::vec_t Vec2;
typedef typename TypeVec<T, 3>::vec_t Vec3;
typedef typename TypeVec<T, 4>::vec_t Vec4;
static __device__ T call(T a, T b) { return a | b; }
static __device__ Vec2 call(Vec2 a, Vec2 b) { return VecTraits<Vec2>::make(a.x | b.x, a.y | b.y); }
static __device__ Vec3 call(Vec3 a, Vec3 b) { return VecTraits<Vec3>::make(a.x | b.x, a.y | b.y, a.z | b.z); }
static __device__ Vec4 call(Vec4 a, Vec4 b) { return VecTraits<Vec4>::make(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w); }
};
template <typename T>
struct BinOp<T, BIN_OP_AND>
{
typedef typename TypeVec<T, 2>::vec_t Vec2;
typedef typename TypeVec<T, 3>::vec_t Vec3;
typedef typename TypeVec<T, 4>::vec_t Vec4;
static __device__ T call(T a, T b) { return a & b; }
static __device__ Vec2 call(Vec2 a, Vec2 b) { return VecTraits<Vec2>::make(a.x & b.x, a.y & b.y); }
static __device__ Vec3 call(Vec3 a, Vec3 b) { return VecTraits<Vec3>::make(a.x & b.x, a.y & b.y, a.z & b.z); }
static __device__ Vec4 call(Vec4 a, Vec4 b) { return VecTraits<Vec4>::make(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w); }
};
template <typename T>
struct BinOp<T, BIN_OP_XOR>
{
typedef typename TypeVec<T, 2>::vec_t Vec2;
typedef typename TypeVec<T, 3>::vec_t Vec3;
typedef typename TypeVec<T, 4>::vec_t Vec4;
static __device__ T call(T a, T b) { return a ^ b; }
static __device__ Vec2 call(Vec2 a, Vec2 b) { return VecTraits<Vec2>::make(a.x ^ b.x, a.y ^ b.y); }
static __device__ Vec3 call(Vec3 a, Vec3 b) { return VecTraits<Vec3>::make(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z); }
static __device__ Vec4 call(Vec4 a, Vec4 b) { return VecTraits<Vec4>::make(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); }
};
template <int opid>
__global__ void bitwise_bin_op
(int rows, int cols
, const PtrStep src1, const PtrStep src2, PtrStep dst)
__global__ void bitwise_bin_op
_kernel(int rows, int width
, const PtrStep src1, const PtrStep src2, PtrStep dst)
{
const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
...
...
@@ -491,13 +427,14 @@ namespace cv { namespace gpu { namespace mathfunc
uchar* dst_ptr = dst.ptr(y) + x;
const uchar* src1_ptr = src1.ptr(y) + x;
const uchar* src2_ptr = src2.ptr(y) + x;
if (x + sizeof(uint) - 1 < cols)
if (x + sizeof(uint) - 1 < width)
{
*(uint*)dst_ptr = BinOp<uint, opid>::call(*(uint*)src1_ptr, *(uint*)src2_ptr);
}
else
{
const uchar* src1_end = src1.ptr(y) +
cols
;
const uchar* src1_end = src1.ptr(y) +
width
;
while (src1_ptr < src1_end)
{
*dst_ptr++ = BinOp<uchar, opid>::call(*src1_ptr++, *src2_ptr++);
...
...
@@ -507,135 +444,103 @@ namespace cv { namespace gpu { namespace mathfunc
}
template <
typename T, int cn,
int opid>
__global__ void bitwise_bin_op(int rows, int cols, const PtrStep src1, const PtrStep src2
,
PtrStep dst, const PtrStep mask
)
template <int opid>
void bitwise_bin_op(int rows, int width, const PtrStep src1, const PtrStep src2, PtrStep dst
,
cudaStream_t stream
)
{
typedef typename TypeVec<T, cn>::vec_t Type;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
dim3 threads(16, 16);
dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y));
if (x < cols && y < rows && mask.ptr(y)[x])
{
Type* dst_row = (Type*)dst.ptr(y);
const Type* src1_row = (const Type*)src1.ptr(y);
const Type* src2_row = (const Type*)src2.ptr(y);
dst_row[x] = BinOp<T, opid>::call(src1_row[x], src2_row[x]);
}
bitwise_bin_op_kernel<opid><<<grid, threads>>>(rows, width, src1, src2, dst);
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
template <typename T, int cn, int opid>
__global__ void bitwise_bin_op_two_loads(int rows, int cols, const PtrStep src1, const PtrStep src2,
PtrStep dst, const PtrStep mask)
template <typename T, int opid>
__global__ void bitwise_bin_op_kernel(
int rows, int cols, int cn, const PtrStep src1, const PtrStep src2,
const PtrStep mask, PtrStep dst)
{
typedef typename TypeVec<T, cn>::vec_t Type;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < cols && y < rows && mask.ptr(y)[x])
if (x < cols && y < rows && mask.ptr(y)[x
/ cn
])
{
T
ype* dst_row = (Type
*)dst.ptr(y);
const T
ype* src1_row = (const Type
*)src1.ptr(y);
const T
ype* src2_row = (const Type
*)src2.ptr(y);
dst_row[2 * x] = BinOp<T, opid>::call(src1_row[2 * x], src2_row[2 * x]);
dst_row[
2 * x + 1] = BinOp<T, opid>::call(src1_row[2 * x + 1], src2_row[2 * x + 1
]);
T
* dst_row = (T
*)dst.ptr(y);
const T
* src1_row = (const T
*)src1.ptr(y);
const T
* src2_row = (const T
*)src2.ptr(y);
dst_row[
x] = BinOp<T, opid>::call(src1_row[x], src2_row[x
]);
}
}
template <int opid>
void bitwise_bin_op(int rows, int cols,
const PtrStep src1, const PtrStep src2, PtrStep dst
,
int elem_size
, cudaStream_t stream)
template <
typename T,
int opid>
void bitwise_bin_op(int rows, int cols,
int cn, const PtrStep src1, const PtrStep src2
,
const PtrStep mask, PtrStep dst
, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols * elem_size, threads.x * sizeof(uint)),
divUp(rows, threads.y));
bitwise_bin_op<opid><<<grid, threads>>>(rows, cols * elem_size, src1, src2, dst);
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
bitwise_bin_op_kernel<T, opid><<<grid, threads>>>(rows, cols, cn, src1, src2, mask, dst);
template <int opid>
void bitwise_bin_op(int rows, int cols, const PtrStep src1, const PtrStep src2, PtrStep dst,
int elem_size, const PtrStep mask, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
switch (elem_size)
{
case 1:
bitwise_bin_op<uchar, 1, opid><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
break;
case 2:
bitwise_bin_op<ushort, 1, opid><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
break;
case 3:
bitwise_bin_op<uchar, 3, opid><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
break;
case 4:
bitwise_bin_op<uint, 1, opid><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
break;
case 6:
bitwise_bin_op<ushort, 3, opid><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
break;
case 8:
bitwise_bin_op<uint, 2, opid><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
break;
case 12:
bitwise_bin_op<uint, 3, opid><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
break;
case 16:
bitwise_bin_op<uint, 4, opid><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
break;
case 24:
bitwise_bin_op_two_loads<uint, 3, opid><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
break;
case 32:
bitwise_bin_op_two_loads<uint, 4, opid><<<grid, threads>>>(rows, cols, src1, src2, dst, mask);
break;
}
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
void bitwise_or_caller(int rows, int cols,
const PtrStep src1, const PtrStep src2, int elem_size
, PtrStep dst, cudaStream_t stream)
void bitwise_or_caller(int rows, int cols,
int elem_size1, int cn, const PtrStep src1, const PtrStep src2
, PtrStep dst, cudaStream_t stream)
{
bitwise_bin_op<BIN_OP_OR>(rows, cols
, src1, src2, dst, elem_size
, stream);
bitwise_bin_op<BIN_OP_OR>(rows, cols
* elem_size1 * cn, src1, src2, dst
, stream);
}
void bitwise_or_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, const PtrStep mask, cudaStream_t stream)
template <typename T>
void bitwise_mask_or_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream)
{
bitwise_bin_op<
BIN_OP_OR>(rows, cols, src1, src2, dst, elem_size, mask
, stream);
bitwise_bin_op<
T, BIN_OP_OR>(rows, cols * cn, cn, src1, src2, mask, dst
, stream);
}
template void bitwise_mask_or_caller<uchar>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
template void bitwise_mask_or_caller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
template void bitwise_mask_or_caller<uint>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
void bitwise_and_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, cudaStream_t stream)
void bitwise_and_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream)
{
bitwise_bin_op<BIN_OP_AND>(rows, cols
, src1, src2, dst, elem_size
, stream);
bitwise_bin_op<BIN_OP_AND>(rows, cols
* elem_size1 * cn, src1, src2, dst
, stream);
}
void bitwise_and_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, const PtrStep mask, cudaStream_t stream)
template <typename T>
void bitwise_mask_and_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream)
{
bitwise_bin_op<
BIN_OP_AND>(rows, cols, src1, src2, dst, elem_size, mask
, stream);
bitwise_bin_op<
T, BIN_OP_AND>(rows, cols * cn, cn, src1, src2, mask, dst
, stream);
}
template void bitwise_mask_and_caller<uchar>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
template void bitwise_mask_and_caller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
template void bitwise_mask_and_caller<uint>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
void bitwise_xor_caller(int rows, int cols,
const PtrStep src1, const PtrStep src2, int elem_size
, PtrStep dst, cudaStream_t stream)
void bitwise_xor_caller(int rows, int cols,
int elem_size1, int cn, const PtrStep src1, const PtrStep src2
, PtrStep dst, cudaStream_t stream)
{
bitwise_bin_op<BIN_OP_XOR>(rows, cols
, src1, src2, dst, elem_size
, stream);
bitwise_bin_op<BIN_OP_XOR>(rows, cols
* elem_size1 * cn, src1, src2, dst
, stream);
}
void bitwise_xor_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, const PtrStep mask, cudaStream_t stream)
template <typename T>
void bitwise_mask_xor_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream)
{
bitwise_bin_op<
BIN_OP_XOR>(rows, cols, src1, src2, dst, elem_size, mask
, stream);
bitwise_bin_op<
T, BIN_OP_XOR>(rows, cols * cn, cn, src1, src2, mask, dst
, stream);
}
template void bitwise_mask_xor_caller<uchar>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
template void bitwise_mask_xor_caller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
template void bitwise_mask_xor_caller<uint>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
//////////////////////////////////////////////////////////////////////////////
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment