opencv / commit acac27d8
authored Sep 12, 2011 by Vladislav Vinogradov

optimized gpu::multiply
parent 6763bd6d
Showing 2 changed files with 43 additions and 53 deletions
modules/gpu/src/cuda/element_operations.cu   +34 -43
modules/gpu/src/element_operations.cpp        +9 -10
modules/gpu/src/cuda/element_operations.cu
@@ -607,68 +607,59 @@ namespace cv { namespace gpu { namespace device
     //////////////////////////////////////////////////////////////////////////
     // multiply
 
-    // TODO implement more efficient version
-    template <typename TSrc1, typename TSrc2, typename TDst, int cn>
-    void __global__ multiplyKernel(const PtrStep src1, const PtrStep src2, int rows, int cols,
-                                   PtrStep dst)
-    {
-        int x = blockIdx.x * blockDim.x + threadIdx.x;
-        int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x < cols && y < rows)
-        {
-            ((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc1*)src1.ptr(y))[x] * ((TSrc2*)src2.ptr(y))[x / cn]);
-        }
-    }
-
-    template <typename TSrc1, typename TSrc2, typename TDst, int cn>
-    void multiplyCaller(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream)
-    {
-        dim3 threads(32, 8);
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
-
-        multiplyKernel<TSrc1, TSrc2, TDst, cn><<<grid, threads>>>(src1, src2, rows, cols, dst);
-        cudaSafeCall(cudaGetLastError());
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
-    }
-
-    template void multiplyCaller<uchar, float, uchar, 4>(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream);
+    struct multiply_8uc4_32f : binary_function<uint, float, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, float b) const
+        {
+            uint res = 0;
+
+            res |= 0xffu & (saturate_cast<uchar>((0xffu & (a      )) * b)      );
+            res |= 0xffu & (saturate_cast<uchar>((0xffu & (a >>  8)) * b) <<  8);
+            res |= 0xffu & (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);
+            res |= 0xffu & (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);
+
+            return res;
+        }
+    };
+
+    template <> struct TransformFunctorTraits<multiply_8uc4_32f> : DefaultTransformFunctorTraits<multiply_8uc4_32f>
+    {
+        enum { smart_block_dim_x = 8 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 8 };
+    };
+
+    void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream)
+    {
+        transform(static_cast< DevMem2D_<uint> >(src1), src2, static_cast< DevMem2D_<uint> >(dst), multiply_8uc4_32f(), stream);
+    }
 
     //////////////////////////////////////////////////////////////////////////
     // multiply (by scalar)
 
-    // TODO implement more efficient version
-    template <typename TSrc, typename TDst>
-    void __global__ multiplyScalarKernel(const PtrStep src1, float scale, int rows, int cols, PtrStep dst)
-    {
-        int x = blockIdx.x * blockDim.x + threadIdx.x;
-        int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x < cols && y < rows)
-        {
-            ((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc*)src1.ptr(y))[x] * scale);
-        }
-    }
-
-    template <typename TSrc, typename TDst>
-    void multiplyScalarCaller(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream)
-    {
-        dim3 threads(32, 8);
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
-
-        multiplyScalarKernel<TSrc, TDst><<<grid, threads>>>(src, scale, rows, cols, dst);
-        cudaSafeCall(cudaGetLastError());
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
-    }
-
-    template void multiplyScalarCaller<uchar, uchar>(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream);
+    template <typename T, typename D, typename S> struct MultiplyScalar : unary_function<T, D>
+    {
+        __host__ __device__ __forceinline__ MultiplyScalar(typename TypeTraits<S>::ParameterType scale_) : scale(scale_) {}
+
+        __device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType a) const
+        {
+            return saturate_cast<D>(a * scale);
+        }
+
+        const S scale;
+    };
+
+    template <> struct TransformFunctorTraits< MultiplyScalar<uchar, uchar, float> > : DefaultTransformFunctorTraits< MultiplyScalar<uchar, uchar, float> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 8 };
+    };
+
+    template <typename T, typename D>
+    void multiplyScalar_gpu(const DevMem2D& src, float scale, const DevMem2D& dst, cudaStream_t stream)
+    {
+        transform(static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), MultiplyScalar<T, D, float>(scale), stream);
+    }
+
+    template void multiplyScalar_gpu<uchar, uchar>(const DevMem2D& src, float scale, const DevMem2D& dst, cudaStream_t stream);
 }}}
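
The gist of the new path: instead of launching the hand-written multiplyKernel, the 8UC4 x 32F case now runs through the generic transform framework with the multiply_8uc4_32f functor above, which treats each four-channel pixel as one packed 32-bit word and scales every byte by the per-pixel float with saturation. A rough host-side illustration of that per-byte math (a hypothetical helper, not part of the commit; plain clamping stands in for the device saturate_cast):

#include <algorithm>
#include <cstdint>

// Hypothetical host-side analogue of multiply_8uc4_32f (not part of the commit):
// unpack the four 8-bit channels of a packed CV_8UC4 pixel, scale each by b,
// saturate to [0, 255], and pack the results back into one 32-bit word.
static std::uint32_t multiply_8uc4_32f_host(std::uint32_t a, float b)
{
    std::uint32_t res = 0;
    for (int shift = 0; shift < 32; shift += 8)
    {
        float scaled  = float((a >> shift) & 0xffu) * b;           // one channel times the weight
        float clamped = std::min(std::max(scaled, 0.0f), 255.0f);  // stand-in for saturate_cast<uchar>
        res |= (std::uint32_t(clamped + 0.5f) & 0xffu) << shift;   // round and put the byte back in place
    }
    return res;
}

Handling a whole uchar4 pixel per element, together with the smart_shift/block hints supplied through the TransformFunctorTraits specializations, is presumably where the speedup over the old one-thread-per-byte multiplyKernel comes from.
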
modules/gpu/src/element_operations.cpp
@@ -199,22 +199,21 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stre
 namespace cv { namespace gpu { namespace device
 {
-    template <typename TSrc1, typename TSrc2, typename TDst, int cn>
-    void multiplyCaller(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream);
+    void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);
 
-    template <typename TSrc, typename TDst>
-    void multiplyScalarCaller(const PtrStep src, float scalar, int rows, int cols, PtrStep dst, cudaStream_t stream);
+    template <typename T, typename D>
+    void multiplyScalar_gpu(const DevMem2D& src, float scale, const DevMem2D& dst, cudaStream_t stream);
 }}}
 
 void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 {
-    if (src1.type() == CV_8UC4 && src2.type() == CV_32F)
+    if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1)
     {
         CV_Assert(src1.size() == src2.size());
 
         dst.create(src1.size(), src1.type());
 
-        device::multiplyCaller<uchar, float, uchar, 4>(static_cast<DevMem2D>(src1), static_cast<DevMem2D>(src2),
-            src1.rows, src1.cols * 4, static_cast<DevMem2D>(dst), StreamAccessor::getStream(stream));
+        device::multiply_gpu(src1, src2, dst, StreamAccessor::getStream(stream));
     }
     else
         nppArithmCaller(src1, src2, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R,
             StreamAccessor::getStream(stream));
@@ -225,8 +224,8 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream&
     if (src.depth() == CV_8U)
     {
         dst.create(src.size(), src.type());
 
-        device::multiplyScalarCaller<uchar, uchar>(static_cast<DevMem2D>(src), (float)(sc[0]), src.rows, src.cols * src.channels(),
-            static_cast<DevMem2D>(dst), StreamAccessor::getStream(stream));
+        device::multiplyScalar_gpu<uchar, uchar>(src.reshape(1), (float)(sc[0]), dst, StreamAccessor::getStream(stream));
     }
     else
     {
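
On the caller side nothing changes: the public cv::gpu::multiply overloads keep their signatures and simply dispatch to the new device functions. A minimal usage sketch (illustrative only; the function and Mat names are made up, and it assumes the 2.3-era gpu API where the Stream argument defaults to Stream::Null()):

#include <opencv2/gpu/gpu.hpp>

// Hypothetical example, not part of the commit.
void multiply_example(const cv::Mat& rgba8u, const cv::Mat& weights32f)
{
    cv::gpu::GpuMat src1(rgba8u);      // CV_8UC4 image, uploaded to the device
    cv::gpu::GpuMat src2(weights32f);  // CV_32FC1 weights, same size as src1
    cv::gpu::GpuMat dst, scaled;

    // CV_8UC4 * CV_32FC1 now goes through device::multiply_gpu (the transform path).
    cv::gpu::multiply(src1, src2, dst);

    // A CV_8U source times a scalar now goes through device::multiplyScalar_gpu via src.reshape(1).
    cv::gpu::multiply(src1, cv::Scalar::all(0.5), scaled);
}
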