Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
903f835d
Commit
903f835d
authored
Sep 10, 2011
by
Alexey Spizhevoy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added support of 8UC4*32FC1 multiply into GPU module
parent
ab3ec788
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
116 additions
and
21 deletions
+116
-21
element_operations.cu
modules/gpu/src/cuda/element_operations.cu
+67
-0
element_operations.cpp
modules/gpu/src/element_operations.cpp
+49
-21
No files found.
modules/gpu/src/cuda/element_operations.cu
View file @
903f835d
...
...
@@ -602,4 +602,71 @@ namespace cv { namespace gpu { namespace device
template void pow_caller<ushort>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
template void pow_caller<int>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
template void pow_caller<float>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
//////////////////////////////////////////////////////////////////////////
// multiply
template <typename TSrc1, typename TSrc2, typename TDst, int cn>
void __global__ multiplyKernel(const PtrStep src1, const PtrStep src2, int rows, int cols,
PtrStep dst)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < cols && y < rows)
{
((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc1*)src1.ptr(y))[x] * ((TSrc2*)src2.ptr(y))[x / cn]);
}
}
template <typename TSrc1, typename TSrc2, typename TDst, int cn>
void multiplyCaller(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
multiplyKernel<TSrc1, TSrc2, TDst, cn><<<grid, threads>>>(src1, src2, rows, cols, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template void multiplyCaller<uchar, float, uchar, 4>(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream);
//////////////////////////////////////////////////////////////////////////
// multiply (by scalar)
template <typename TSrc, typename TDst>
void __global__ multiplyScalarKernel(const PtrStep src1, float scale, int rows, int cols, PtrStep dst)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < cols && y < rows)
{
((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc*)src1.ptr(y))[x] * scale);
}
}
template <typename TSrc, typename TDst>
void multiplyScalarCaller(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
multiplyScalarKernel<TSrc, TDst><<<grid, threads>>>(src, scale, rows, cols, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template void multiplyScalarCaller<uchar, uchar>(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream);
}}}
modules/gpu/src/element_operations.cpp
View file @
903f835d
...
...
@@ -197,11 +197,59 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stre
nppArithmCaller
(
src2
,
src1
,
dst
,
nppiSub_8u_C1RSfs
,
nppiSub_8u_C4RSfs
,
nppiSub_32s_C1R
,
nppiSub_32f_C1R
,
StreamAccessor
::
getStream
(
stream
));
}
namespace
cv
{
namespace
gpu
{
namespace
device
{
template
<
typename
TSrc1
,
typename
TSrc2
,
typename
TDst
,
int
cn
>
void
multiplyCaller
(
const
PtrStep
src1
,
const
PtrStep
src2
,
int
rows
,
int
cols
,
PtrStep
dst
,
cudaStream_t
stream
);
template
<
typename
TSrc
,
typename
TDst
>
void
multiplyScalarCaller
(
const
PtrStep
src
,
float
scalar
,
int
rows
,
int
cols
,
PtrStep
dst
,
cudaStream_t
stream
);
}}}
void
cv
::
gpu
::
multiply
(
const
GpuMat
&
src1
,
const
GpuMat
&
src2
,
GpuMat
&
dst
,
Stream
&
stream
)
{
nppArithmCaller
(
src1
,
src2
,
dst
,
nppiMul_8u_C1RSfs
,
nppiMul_8u_C4RSfs
,
nppiMul_32s_C1R
,
nppiMul_32f_C1R
,
StreamAccessor
::
getStream
(
stream
));
if
(
src1
.
type
()
==
CV_8UC4
&&
src2
.
type
()
==
CV_32F
)
{
CV_Assert
(
src1
.
size
()
==
src2
.
size
());
dst
.
create
(
src1
.
size
(),
src1
.
type
());
device
::
multiplyCaller
<
uchar
,
float
,
uchar
,
4
>
(
static_cast
<
DevMem2D
>
(
src1
),
static_cast
<
DevMem2D
>
(
src2
),
src1
.
rows
,
src1
.
cols
*
4
,
static_cast
<
DevMem2D
>
(
dst
),
StreamAccessor
::
getStream
(
stream
));
}
else
nppArithmCaller
(
src1
,
src2
,
dst
,
nppiMul_8u_C1RSfs
,
nppiMul_8u_C4RSfs
,
nppiMul_32s_C1R
,
nppiMul_32f_C1R
,
StreamAccessor
::
getStream
(
stream
));
}
void
cv
::
gpu
::
multiply
(
const
GpuMat
&
src
,
const
Scalar
&
sc
,
GpuMat
&
dst
,
Stream
&
stream
)
{
if
(
src
.
depth
()
==
CV_8U
)
{
dst
.
create
(
src
.
size
(),
src
.
type
());
device
::
multiplyScalarCaller
<
uchar
,
uchar
>
(
static_cast
<
DevMem2D
>
(
src
),
(
float
)(
sc
[
0
]),
src
.
rows
,
src
.
cols
*
src
.
channels
(),
static_cast
<
DevMem2D
>
(
dst
),
StreamAccessor
::
getStream
(
stream
));
}
else
{
CV_Assert
(
src
.
type
()
==
CV_32FC1
);
dst
.
create
(
src
.
size
(),
src
.
type
());
NppiSize
sz
;
sz
.
width
=
src
.
cols
;
sz
.
height
=
src
.
rows
;
cudaStream_t
cudaStream
=
StreamAccessor
::
getStream
(
stream
);
NppStreamHandler
h
(
cudaStream
);
nppSafeCall
(
nppiMulC_32f_C1R
(
src
.
ptr
<
Npp32f
>
(),
static_cast
<
int
>
(
src
.
step
),
(
Npp32f
)
sc
[
0
],
dst
.
ptr
<
Npp32f
>
(),
static_cast
<
int
>
(
dst
.
step
),
sz
)
);
if
(
cudaStream
==
0
)
cudaSafeCall
(
cudaDeviceSynchronize
()
);
}
}
void
cv
::
gpu
::
divide
(
const
GpuMat
&
src1
,
const
GpuMat
&
src2
,
GpuMat
&
dst
,
Stream
&
stream
)
{
nppArithmCaller
(
src2
,
src1
,
dst
,
nppiDiv_8u_C1RSfs
,
nppiDiv_8u_C4RSfs
,
nppiDiv_32s_C1R
,
nppiDiv_32f_C1R
,
StreamAccessor
::
getStream
(
stream
));
...
...
@@ -227,26 +275,6 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream&
callers
[
src
.
channels
()](
src
,
sc
,
dst
,
StreamAccessor
::
getStream
(
stream
));
}
void
cv
::
gpu
::
multiply
(
const
GpuMat
&
src
,
const
Scalar
&
sc
,
GpuMat
&
dst
,
Stream
&
stream
)
{
CV_Assert
(
src
.
type
()
==
CV_32FC1
);
dst
.
create
(
src
.
size
(),
src
.
type
());
NppiSize
sz
;
sz
.
width
=
src
.
cols
;
sz
.
height
=
src
.
rows
;
cudaStream_t
cudaStream
=
StreamAccessor
::
getStream
(
stream
);
NppStreamHandler
h
(
cudaStream
);
nppSafeCall
(
nppiMulC_32f_C1R
(
src
.
ptr
<
Npp32f
>
(),
static_cast
<
int
>
(
src
.
step
),
(
Npp32f
)
sc
[
0
],
dst
.
ptr
<
Npp32f
>
(),
static_cast
<
int
>
(
dst
.
step
),
sz
)
);
if
(
cudaStream
==
0
)
cudaSafeCall
(
cudaDeviceSynchronize
()
);
}
void
cv
::
gpu
::
divide
(
const
GpuMat
&
src
,
const
Scalar
&
sc
,
GpuMat
&
dst
,
Stream
&
stream
)
{
CV_Assert
(
src
.
type
()
==
CV_32FC1
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment