Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
acac27d8
Commit
acac27d8
authored
Sep 12, 2011
by
Vladislav Vinogradov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
optimized gpu::multiply
parent
6763bd6d
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
43 additions
and
53 deletions
+43
-53
element_operations.cu
modules/gpu/src/cuda/element_operations.cu
+34
-43
element_operations.cpp
modules/gpu/src/element_operations.cpp
+9
-10
No files found.
modules/gpu/src/cuda/element_operations.cu
View file @
acac27d8
...
@@ -607,68 +607,59 @@ namespace cv { namespace gpu { namespace device
...
@@ -607,68 +607,59 @@ namespace cv { namespace gpu { namespace device
//////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
// multiply
// multiply
// TODO implement more efficient version
struct multiply_8uc4_32f : binary_function<uint, float, uint>
template <typename TSrc1, typename TSrc2, typename TDst, int cn>
void __global__ multiplyKernel(const PtrStep src1, const PtrStep src2, int rows, int cols,
PtrStep dst)
{
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
__device__ __forceinline__ uint operator ()(uint a, float b) const
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < cols && y < rows)
{
{
((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc1*)src1.ptr(y))[x] * ((TSrc2*)src2.ptr(y))[x / cn]);
uint res = 0;
}
}
res |= 0xffu & (saturate_cast<uchar>((0xffu & (a )) * b) );
res |= 0xffu & (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8);
res |= 0xffu & (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);
res |= 0xffu & (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);
template <typename TSrc1, typename TSrc2, typename TDst, int cn>
return res;
void multiplyCaller(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream)
}
{
};
dim3 threads(32, 8);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
multiplyKernel<TSrc1, TSrc2, TDst, cn><<<grid, threads>>>(src1, src2, rows, cols, dst);
template <> struct TransformFunctorTraits<multiply_8uc4_32f> : DefaultTransformFunctorTraits<multiply_8uc4_32f>
cudaSafeCall(cudaGetLastError());
{
enum { smart_block_dim_x = 8 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 8 };
};
if (stream == 0)
void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream)
cudaSafeCall(cudaDeviceSynchronize());
{
transform(static_cast< DevMem2D_<uint> >(src1), src2, static_cast< DevMem2D_<uint> >(dst), multiply_8uc4_32f(), stream);
}
}
template void multiplyCaller<uchar, float, uchar, 4>(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream);
//////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
// multiply (by scalar)
// multiply (by scalar)
// TODO implement more efficient version
template <typename T, typename D, typename S> struct MultiplyScalar : unary_function<T, D>
template <typename TSrc, typename TDst>
void __global__ multiplyScalarKernel(const PtrStep src1, float scale, int rows, int cols, PtrStep dst)
{
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
__host__ __device__ __forceinline__ MultiplyScalar(typename TypeTraits<S>::ParameterType scale_) : scale(scale_) {}
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < cols && y < rows)
__device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType a) const
{
{
((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc*)src1.ptr(y))[x]
* scale);
return saturate_cast<D>(a
* scale);
}
}
}
const S scale;
};
template <typename TSrc, typename TDst>
template <> struct TransformFunctorTraits< MultiplyScalar<uchar, uchar, float> > : DefaultTransformFunctorTraits< MultiplyScalar<uchar, uchar, float> >
void multiplyScalarCaller(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream)
{
{
dim3 threads(32, 8);
enum { smart_block_dim_y = 8 };
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
enum { smart_shift = 8 };
};
multiplyScalarKernel<TSrc, TDst><<<grid, threads>>>(src, scale, rows, cols, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
template <typename T, typename D>
cudaSafeCall(cudaDeviceSynchronize());
void multiplyScalar_gpu(const DevMem2D& src, float scale, const DevMem2D& dst, cudaStream_t stream)
{
transform(static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), MultiplyScalar<T, D, float>(scale), stream);
}
}
template void multiplyScalar_gpu<uchar, uchar>(const DevMem2D& src, float scale, const DevMem2D& dst, cudaStream_t stream);
template void multiplyScalarCaller<uchar, uchar>(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream);
}}}
}}}
modules/gpu/src/element_operations.cpp
View file @
acac27d8
...
@@ -199,22 +199,21 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stre
...
@@ -199,22 +199,21 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stre
namespace
cv
{
namespace
gpu
{
namespace
device
namespace
cv
{
namespace
gpu
{
namespace
device
{
{
template
<
typename
TSrc1
,
typename
TSrc2
,
typename
TDst
,
int
cn
>
void
multiply_gpu
(
const
DevMem2D_
<
uchar4
>&
src1
,
const
DevMem2Df
&
src2
,
const
DevMem2D_
<
uchar4
>&
dst
,
cudaStream_t
stream
);
void
multiplyCaller
(
const
PtrStep
src1
,
const
PtrStep
src2
,
int
rows
,
int
cols
,
PtrStep
dst
,
cudaStream_t
stream
);
template
<
typename
T
Src
,
typename
TDst
>
template
<
typename
T
,
typename
D
>
void
multiplyScalar
Caller
(
const
PtrStep
src
,
float
scalar
,
int
rows
,
int
cols
,
PtrStep
dst
,
cudaStream_t
stream
);
void
multiplyScalar
_gpu
(
const
DevMem2D
&
src
,
float
scale
,
const
DevMem2D
&
dst
,
cudaStream_t
stream
);
}}}
}}}
void
cv
::
gpu
::
multiply
(
const
GpuMat
&
src1
,
const
GpuMat
&
src2
,
GpuMat
&
dst
,
Stream
&
stream
)
void
cv
::
gpu
::
multiply
(
const
GpuMat
&
src1
,
const
GpuMat
&
src2
,
GpuMat
&
dst
,
Stream
&
stream
)
{
{
if
(
src1
.
type
()
==
CV_8UC4
&&
src2
.
type
()
==
CV_32F
)
if
(
src1
.
type
()
==
CV_8UC4
&&
src2
.
type
()
==
CV_32F
C1
)
{
{
CV_Assert
(
src1
.
size
()
==
src2
.
size
());
CV_Assert
(
src1
.
size
()
==
src2
.
size
());
dst
.
create
(
src1
.
size
(),
src1
.
type
());
dst
.
create
(
src1
.
size
(),
src1
.
type
());
device
::
multiplyCaller
<
uchar
,
float
,
uchar
,
4
>
(
static_cast
<
DevMem2D
>
(
src1
),
static_cast
<
DevMem2D
>
(
src2
),
src1
.
rows
,
src1
.
cols
*
4
,
static_cast
<
DevMem2D
>
(
dst
),
device
::
multiply_gpu
(
src1
,
src2
,
dst
,
StreamAccessor
::
getStream
(
stream
));
StreamAccessor
::
getStream
(
stream
));
}
}
else
else
nppArithmCaller
(
src1
,
src2
,
dst
,
nppiMul_8u_C1RSfs
,
nppiMul_8u_C4RSfs
,
nppiMul_32s_C1R
,
nppiMul_32f_C1R
,
StreamAccessor
::
getStream
(
stream
));
nppArithmCaller
(
src1
,
src2
,
dst
,
nppiMul_8u_C1RSfs
,
nppiMul_8u_C4RSfs
,
nppiMul_32s_C1R
,
nppiMul_32f_C1R
,
StreamAccessor
::
getStream
(
stream
));
...
@@ -225,8 +224,8 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream&
...
@@ -225,8 +224,8 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream&
if
(
src
.
depth
()
==
CV_8U
)
if
(
src
.
depth
()
==
CV_8U
)
{
{
dst
.
create
(
src
.
size
(),
src
.
type
());
dst
.
create
(
src
.
size
(),
src
.
type
());
device
::
multiplyScalarCaller
<
uchar
,
uchar
>
(
static_cast
<
DevMem2D
>
(
src
),
(
float
)(
sc
[
0
]),
src
.
rows
,
src
.
cols
*
src
.
channels
(),
static_cast
<
DevMem2D
>
(
dst
)
,
StreamAccessor
::
getStream
(
stream
));
device
::
multiplyScalar_gpu
<
uchar
,
uchar
>
(
src
.
reshape
(
1
),
(
float
)(
sc
[
0
]),
dst
,
StreamAccessor
::
getStream
(
stream
));
}
}
else
else
{
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment